#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program. If not, see
#The function paste_processed_data pastes data of different epigeentic marks (for one cell line and one gene type) together in one data frame
paste_processed_data = function(paths,mark_names,blacklist_path=NULL,dna_normalization_vec=NULL,cpg_path = NULL,quantile_cutoff=0.01){
#paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for a specific cell line and gene type, so it might look like
#c("/home/test_TSS_1.csv","/home/test2_TSS_1.csv") or
#c("/home/test_TSS_40.csv","/home/test2_TSS_40.csv") or
#c("/home/test_TTS_1.csv","/home/test2_TTS_1.csv")
#mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so
#if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test_TSS_1.csv" corresponds to H3K4me3 an so on
#optional parameters:
#blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt"
#if no blacklist_path is specified, then we assume no genes are desired to become deleted
#(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly)
#dna_normalization_vec is the vector that specifies which epigenetic mark we do want to normalized with respect to the number of CpGs
#if specified, it has to give the indices in mark mark_names (and paths) that should be normalized
#so if dna_normalization_vec = 3, it would mean that everything stemming from paths[3] has to be normalized
#cpg_path is the path to the blacklist (the output of counting_the_number_of_cpgs.py), so for example "/home/protein_coding_TSS_1.csv"
#must be specified if dna_normalization_vec
#quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1,
#the bottom 1% are set to 0 and the rest is scaled linearly. If in the unusual cases where the cutoff for the top 1% and bottom 1% is identical, we set all data for that mark to 0.
#if we have a CpG normalization that is done beforehand
#OUTPUT:
#if on 1-bin resolution
#a data frame, where the (i,j)-th entry corresponds to the enrichment at loci i of mark j possibly normalized by the number of CpGs
#the column names is the vector mark_names,
#the row names is the bin names of the loaded csv files
#if a higher resolution is used, for instance, 40-bin
#then to each epigenetic mark we do have 40 columns with the obvious meaning
genes_out = NULL
if(!is.null(blacklist_path)) genes_out = array(unlist(read.table(blacklist_path, quote="\"")))
all_genes = array(unlist(read.csv(paths[1])[,1]))
remove_order = match(genes_out,all_genes)
if(length(remove_order)>0){
gene_names_here = all_genes[-remove_order]
}else{
gene_names_here = all_genes
}
if(!is.null(dna_normalization_vec)){
dna_meth_data = read.csv(cpg_path)
if(length(remove_order)>0){
dna_meth_data = dna_meth_data[-remove_order,2:ncol(dna_meth_data)]
}else{
dna_meth_data = dna_meth_data[,2:ncol(dna_meth_data)]
}
dna_meth_data = as.matrix(dna_meth_data)
}
for(i in 1:length(paths)){
attach_frame = read.csv(paths[i])
if(i==1) pasted_data = matrix(0,length(gene_names_here),length(paths)*(ncol(attach_frame)-1))
if(length(remove_order)>0){
v = attach_frame[-remove_order,2:ncol(attach_frame)]
}else{
v = attach_frame[,2:ncol(attach_frame)]
}
v = as.matrix(v)
if(i %in% dna_normalization_vec){
v = v/dna_meth_data
v[union(which(is.infinite(v)),which(is.nan(v)))] = 0
}
q1=quantile(v,quantile_cutoff)
q2=quantile(v,1-quantile_cutoff)
v[vq2]=q2
v = v-q1
if(q2>q1){
v = v/(q2-q1)
}else{
v = rep(0,length(v))
}
pasted_data[,(i-1)*ncol(v)+1:ncol(v)] = v
}
pasted_data = data.frame(pasted_data)
rownames(pasted_data) = gene_names_here
mark_names_here = mark_names
if(ncol(attach_frame)>2) mark_names_here = c(t(outer(mark_names,1:(ncol(attach_frame)-1),FUN=paste ,sep="_")))
colnames(pasted_data) = mark_names_here
return(pasted_data)
}
#The function combine_cage_data adds the plus and minus tag information for Cage together, adds a small pseudo_count and tages the logarithm
combine_cage_data = function(paths,mark_names,blacklist_path=NULL,pseudo_count=0.01){
#paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for the Cage plus and minus strand (always TSS and a fixed gene type), so it might look like
#c("/home/cage_plus_TSS_1.csv","/home/cage_minus_TSS_1.csv") or
#c("/home/cage_plus_TSS_40.csv","/home/cage_minus_TSS_40.csv")
#optional parameters:
#blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt"
#if no blacklist_path is specified, then we assume no genes are desired to become deleted
#(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly)
#pseudo_count is the small pseudo cou-nt that is added to each entry for each gene before taking the logarithm
#that is necessary as some values might be zero before taking the logarithm
#PROCEDURE AND OUTPUT:
#if the bin resolution is odd (for instance, 1-bin resolution):
#we load the plus and the minus strand csv file, respectively
#remove the genes from the blacklist and we take the middle bin information for each gene for both plus and minus strand information
#we add them, add the pseudo_count and take the logarithm to base 10
#the output is the vector where the i-th entry corresponds to the gene and the value is the Cage gene expression value by the above method
#if the bin resolution is even (for instance, 40-bin resolution):
#identical as if the resolution is odd,
#only that we de not take the middle bin information for each gene for both plus and minus strand information
#but the bin left and right from the middle for each gene for both plus and minus strand information (20th and 21st bin on 40-bin-resolution) and add them
#then we proceed as outlined above and the output is identical
genes_out = NULL
if(!is.null(blacklist_path)) genes_out = array(unlist(read.table(blacklist_path, quote="\"")))
all_genes = array(unlist(read.csv(paths[1])[,1]))
remove_order = match(genes_out,all_genes)
if(length(remove_order)>0){
gene_names_here = all_genes[-remove_order]
}else{
gene_names_here = all_genes
}
dataset = matrix(0,length(gene_names_here),2)
for(i in 1:2){
attach_frame = read.csv(paths[i])
if(length(remove_order)>0){
attach_frame = attach_frame[-remove_order,2:ncol(attach_frame)]
}else{
attach_frame = attach_frame[,2:ncol(attach_frame)]
}
if(is.null(ncol(attach_frame))){
dataset[,i] = attach_frame
}else{
if(ncol(attach_frame)%%2 == 0){
dataset[,i] = attach_frame[,ncol(attach_frame)/2]+attach_frame[,ncol(attach_frame)/2+1]
}else{
dataset[,i] = attach_frame[,(ncol(attach_frame)+1)/2]
}
}
}
dataset = rowSums(dataset)
dataset = dataset+pseudo_count
dataset = log(dataset)
names(dataset) = gene_names_here
return(dataset)
}