#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see #The function paste_processed_data pastes data of different epigeentic marks (for one cell line and one gene type) together in one data frame paste_processed_data = function(paths,mark_names,blacklist_path=NULL,dna_normalization_vec=NULL,cpg_path = NULL,quantile_cutoff=0.01){ #paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for a specific cell line and gene type, so it might look like #c("/home/test_TSS_1.csv","/home/test2_TSS_1.csv") or #c("/home/test_TSS_40.csv","/home/test2_TSS_40.csv") or #c("/home/test_TTS_1.csv","/home/test2_TTS_1.csv") #mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so #if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test_TSS_1.csv" corresponds to H3K4me3 an so on #optional parameters: #blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt" #if no blacklist_path is specified, then we assume no genes are desired to become deleted #(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly) #dna_normalization_vec is the vector that specifies which epigenetic mark we do want to normalized with respect to the number of CpGs #if specified, it has to give the indices in mark mark_names (and paths) that should be normalized #so if dna_normalization_vec = 3, it would mean that everything stemming from paths[3] has to be normalized #cpg_path is the path to the blacklist (the output of counting_the_number_of_cpgs.py), so for example "/home/protein_coding_TSS_1.csv" #must be specified if dna_normalization_vec #quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1, #the bottom 1% are set to 0 and the rest is scaled linearly. If in the unusual cases where the cutoff for the top 1% and bottom 1% is identical, we set all data for that mark to 0. #if we have a CpG normalization that is done beforehand #OUTPUT: #if on 1-bin resolution #a data frame, where the (i,j)-th entry corresponds to the enrichment at loci i of mark j possibly normalized by the number of CpGs #the column names is the vector mark_names, #the row names is the bin names of the loaded csv files #if a higher resolution is used, for instance, 40-bin #then to each epigenetic mark we do have 40 columns with the obvious meaning genes_out = NULL if(!is.null(blacklist_path)) genes_out = array(unlist(read.table(blacklist_path, quote="\""))) all_genes = array(unlist(read.csv(paths[1])[,1])) remove_order = match(genes_out,all_genes) if(length(remove_order)>0){ gene_names_here = all_genes[-remove_order] }else{ gene_names_here = all_genes } if(!is.null(dna_normalization_vec)){ dna_meth_data = read.csv(cpg_path) if(length(remove_order)>0){ dna_meth_data = dna_meth_data[-remove_order,2:ncol(dna_meth_data)] }else{ dna_meth_data = dna_meth_data[,2:ncol(dna_meth_data)] } dna_meth_data = as.matrix(dna_meth_data) } for(i in 1:length(paths)){ attach_frame = read.csv(paths[i]) if(i==1) pasted_data = matrix(0,length(gene_names_here),length(paths)*(ncol(attach_frame)-1)) if(length(remove_order)>0){ v = attach_frame[-remove_order,2:ncol(attach_frame)] }else{ v = attach_frame[,2:ncol(attach_frame)] } v = as.matrix(v) if(i %in% dna_normalization_vec){ v = v/dna_meth_data v[union(which(is.infinite(v)),which(is.nan(v)))] = 0 } q1=quantile(v,quantile_cutoff) q2=quantile(v,1-quantile_cutoff) v[vq2]=q2 v = v-q1 if(q2>q1){ v = v/(q2-q1) }else{ v = rep(0,length(v)) } pasted_data[,(i-1)*ncol(v)+1:ncol(v)] = v } pasted_data = data.frame(pasted_data) rownames(pasted_data) = gene_names_here mark_names_here = mark_names if(ncol(attach_frame)>2) mark_names_here = c(t(outer(mark_names,1:(ncol(attach_frame)-1),FUN=paste ,sep="_"))) colnames(pasted_data) = mark_names_here return(pasted_data) } #The function combine_cage_data adds the plus and minus tag information for Cage together, adds a small pseudo_count and tages the logarithm combine_cage_data = function(paths,mark_names,blacklist_path=NULL,pseudo_count=0.01){ #paths is the vector of paths to the csv files genereated by BEDGRAPH-processing.py and/or WIG-processing.py for the Cage plus and minus strand (always TSS and a fixed gene type), so it might look like #c("/home/cage_plus_TSS_1.csv","/home/cage_minus_TSS_1.csv") or #c("/home/cage_plus_TSS_40.csv","/home/cage_minus_TSS_40.csv") #optional parameters: #blacklist_path is the path to the blacklist (the output of remove_genes_from_X_and_Y.py), so for example "/home/protein_coding_Not_to_be_considered.txt" #if no blacklist_path is specified, then we assume no genes are desired to become deleted #(suggested not to do so, as gonosomal genes and mitochondrial genes can change the measurements significantly) #pseudo_count is the small pseudo cou-nt that is added to each entry for each gene before taking the logarithm #that is necessary as some values might be zero before taking the logarithm #PROCEDURE AND OUTPUT: #if the bin resolution is odd (for instance, 1-bin resolution): #we load the plus and the minus strand csv file, respectively #remove the genes from the blacklist and we take the middle bin information for each gene for both plus and minus strand information #we add them, add the pseudo_count and take the logarithm to base 10 #the output is the vector where the i-th entry corresponds to the gene and the value is the Cage gene expression value by the above method #if the bin resolution is even (for instance, 40-bin resolution): #identical as if the resolution is odd, #only that we de not take the middle bin information for each gene for both plus and minus strand information #but the bin left and right from the middle for each gene for both plus and minus strand information (20th and 21st bin on 40-bin-resolution) and add them #then we proceed as outlined above and the output is identical genes_out = NULL if(!is.null(blacklist_path)) genes_out = array(unlist(read.table(blacklist_path, quote="\""))) all_genes = array(unlist(read.csv(paths[1])[,1])) remove_order = match(genes_out,all_genes) if(length(remove_order)>0){ gene_names_here = all_genes[-remove_order] }else{ gene_names_here = all_genes } dataset = matrix(0,length(gene_names_here),2) for(i in 1:2){ attach_frame = read.csv(paths[i]) if(length(remove_order)>0){ attach_frame = attach_frame[-remove_order,2:ncol(attach_frame)] }else{ attach_frame = attach_frame[,2:ncol(attach_frame)] } if(is.null(ncol(attach_frame))){ dataset[,i] = attach_frame }else{ if(ncol(attach_frame)%%2 == 0){ dataset[,i] = attach_frame[,ncol(attach_frame)/2]+attach_frame[,ncol(attach_frame)/2+1] }else{ dataset[,i] = attach_frame[,(ncol(attach_frame)+1)/2] } } } dataset = rowSums(dataset) dataset = dataset+pseudo_count dataset = log(dataset) names(dataset) = gene_names_here return(dataset) }