#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program.  If not, see <http://www.gnu.org/licenses/>


#The function paste_processed_data pastes data of different epigeentic marks (for one cell line and one gene type) together in one data frame
paste_processed_data = function(paths,mark_names,blacklist_path=NULL,dna_normalization_vec=NULL,cpg_path = NULL,quantile_cutoff=0.01){
  #paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for a specific cell line and gene type, so it might look like
  #c("/home/test_TSS_1.csv","/home/test2_TSS_1.csv") or 
  #c("/home/test_TSS_40.csv","/home/test2_TSS_40.csv") or
  #c("/home/test_TTS_1.csv","/home/test2_TTS_1.csv")
  
  #mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so
  #if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test_TSS_1.csv" corresponds to H3K4me3 an so on
  
  
  #optional parameters:
  
  #blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt"
  #if no blacklist_path is specified, then we assume no genes are desired to become deleted 
  #(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly)
  
  #dna_normalization_vec is the vector that specifies which epigenetic mark we do want to normalized  with respect to the number of CpGs
  #if specified, it has to give the indices in mark mark_names (and paths) that should be normalized
  #so if dna_normalization_vec = 3, it would mean that everything stemming from paths[3] has to be normalized
  
  #cpg_path is the path to the blacklist (the output of counting_the_number_of_cpgs.py), so for example "/home/protein_coding_TSS_1.csv"
  #must be specified if dna_normalization_vec

  #quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1,
  #the bottom 1% are set to 0 and the rest is scaled linearly. If in the unusual cases where the cutoff for the top 1% and bottom 1% is identical, we set all data for that mark to 0.
  #if we have a CpG normalization that is done beforehand
  
  #OUTPUT:
  #if on 1-bin resolution
  #a data frame, where the (i,j)-th entry corresponds to the enrichment at loci i of mark j possibly normalized by the number of CpGs
  #the column names is the vector mark_names,
  #the row names is the bin names of the loaded csv files
  
  #if a higher resolution is used, for instance, 40-bin
  #then to each epigenetic mark we do have 40 columns with the obvious meaning
  
  genes_out = NULL
  if(!is.null(blacklist_path))  genes_out = array(unlist(read.table(blacklist_path, quote="\"")))
  all_genes = array(unlist(read.csv(paths[1])[,1]))
  remove_order = match(genes_out,all_genes)
  if(length(remove_order)>0){
    gene_names_here = all_genes[-remove_order]
  }else{
    gene_names_here = all_genes
  }
  
  if(!is.null(dna_normalization_vec)){                                
    dna_meth_data = read.csv(cpg_path)
    if(length(remove_order)>0){
      dna_meth_data = dna_meth_data[-remove_order,2:ncol(dna_meth_data)]
    }else{
      dna_meth_data = dna_meth_data[,2:ncol(dna_meth_data)]
    }
    dna_meth_data = as.matrix(dna_meth_data)
  }
  
  
  for(i in 1:length(paths)){
    attach_frame = read.csv(paths[i])
    if(i==1) pasted_data = matrix(0,length(gene_names_here),length(paths)*(ncol(attach_frame)-1))
    if(length(remove_order)>0){
      v = attach_frame[-remove_order,2:ncol(attach_frame)]
    }else{
      v = attach_frame[,2:ncol(attach_frame)]
    }
    v = as.matrix(v)
    if(i %in% dna_normalization_vec){
      v = v/dna_meth_data
      v[union(which(is.infinite(v)),which(is.nan(v)))] = 0
    }

    q1=quantile(v,quantile_cutoff)
    q2=quantile(v,1-quantile_cutoff)
    v[v<q1]=q1
    v[v>q2]=q2
    v = v-q1
    if(q2>q1){
      v = v/(q2-q1)
    }else{
      v = rep(0,length(v))
    }

    pasted_data[,(i-1)*ncol(v)+1:ncol(v)] = v

  }
  pasted_data = data.frame(pasted_data)
  rownames(pasted_data) = gene_names_here
  
  mark_names_here = mark_names
  if(ncol(attach_frame)>2) mark_names_here = c(t(outer(mark_names,1:(ncol(attach_frame)-1),FUN=paste ,sep="_")))
  
  colnames(pasted_data) = mark_names_here
  return(pasted_data)
}


#The function combine_cage_data adds the plus and minus tag information for Cage together, adds a small pseudo_count and tages the logarithm
combine_cage_data = function(paths,mark_names,blacklist_path=NULL,pseudo_count=0.01){
  #paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for the Cage plus and minus strand (always TSS and a fixed gene type), so it might look like
  #c("/home/cage_plus_TSS_1.csv","/home/cage_minus_TSS_1.csv") or 
  #c("/home/cage_plus_TSS_40.csv","/home/cage_minus_TSS_40.csv")
  
  #optional parameters:
  
  #blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt"
  #if no blacklist_path is specified, then we assume no genes are desired to become deleted 
  #(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly)
  
  #pseudo_count is the small pseudo cou-nt that is added to each entry for each gene before taking the logarithm
  #that is necessary as some values might be zero before taking the logarithm
  
  #PROCEDURE AND OUTPUT:
  #if the bin resolution is odd (for instance, 1-bin resolution):
  #we load the plus and the minus strand csv file, respectively
  #remove the genes from the blacklist and we take the middle bin information for each gene for both plus and minus strand information
  #we add them, add the pseudo_count and take the logarithm to base 10
  #the output is the vector where the i-th entry corresponds to the gene and the value is the Cage gene expression value by the above method
  
  #if the bin resolution is even (for instance, 40-bin resolution):
  #identical as if the resolution is odd,
  #only that we de not take the middle bin information for each gene for both plus and minus strand information
  #but the bin left and right from the middle for each gene for both plus and minus strand information (20th and 21st bin on 40-bin-resolution) and add them
  #then we proceed as outlined above and the output is identical
  
  genes_out = NULL
  if(!is.null(blacklist_path))  genes_out = array(unlist(read.table(blacklist_path, quote="\"")))
  all_genes = array(unlist(read.csv(paths[1])[,1]))
  remove_order = match(genes_out,all_genes)
  if(length(remove_order)>0){
    gene_names_here = all_genes[-remove_order]
  }else{
    gene_names_here = all_genes
  }
  
  dataset = matrix(0,length(gene_names_here),2)
  for(i in 1:2){
    attach_frame = read.csv(paths[i])
    if(length(remove_order)>0){
      attach_frame = attach_frame[-remove_order,2:ncol(attach_frame)]
    }else{
      attach_frame = attach_frame[,2:ncol(attach_frame)]
    }
    
    if(is.null(ncol(attach_frame))){
      dataset[,i] = attach_frame
    }else{
      if(ncol(attach_frame)%%2 == 0){
        dataset[,i] = attach_frame[,ncol(attach_frame)/2]+attach_frame[,ncol(attach_frame)/2+1]
      }else{
        dataset[,i] = attach_frame[,(ncol(attach_frame)+1)/2]
      }
    }
  }
  
  dataset = rowSums(dataset)
  dataset = dataset+pseudo_count
  dataset = log(dataset)
  names(dataset) = gene_names_here
  return(dataset)
}