#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see args<-commandArgs(TRUE) #Example call of this file: Rscript Enrichments_per_bin_processing.R /home/input.bg /home/output.RData BSgenome.Hsapiens.UCSC.hg19 4 1000 chr1 chr2 chr3 #This file reads in a bedgraph type file (see 1st and 4th argument for further details) and #outputs a RData file containing a matrix with column headers "Tile_name" "Score" and in each row we have a row name/bin name and #a score value of enrichments of the input file that fall into that region (see 2nd argument for further details) #for each chromosome that should be considered (see 6th to nth argument for further details) in the respective annotation (see 3rd argument for further details) #we tile the chromosome into tile size bp chunks (see 5th argument for further details) starting from #bp 1 to bp tile size, from bp (tile size+1) to 2*tile size, etc., until we reach the end and a possible remainder at the end of each chromosome #with less than tile size bps is not considered #the entity of these bins at all chromosomes that should be considered is making up the entity of all bins for which we output and count enrichments falling into these bins #1st argument: Path to file that should be processed read_file = args[1] #e.g. /home/input.bg #This program is only intended to work if the lines in the file looks as follows (bedgraph format style): #Every line looks like (tab-delimited) #chromosome start end score, so for instance #chr1 1 1000 4 #chr1 500 2000 13 #or alternatively like (tab-delimited) #chromosome start end, so for instance #chr1 1 200 #chr1 50 250 #in which case we assume the value to be 1 (might make sense for aligned reads, when each line represents a read) #The input has to be consistently one of the two (so not switching between both types)! #Also headers have to be removed! #2nd argument: Path where the processed data should be stored (RData) store_file = args[2] #e.g. /home/output.RData #3rd argument: Which genome annotation to use genome_annotation = args[3] #e.g. #BSgenome.Hsapiens.UCSC.hg19, #BSgenome.Mmusculus.UCSC.mm10 or #BSgenome.Scerevisiae.UCSC.sacCer3 #4th argument: no of columns to consider (3 or 4) no_cols = as.numeric(args[4]) #if the input file looks like #chromosome start end value, so for instance #chr1 1 1000 4 #chr1 500 2000 13 #then no_cols is 4 #or alternatively like #chromosome start end, so for instance #chr1 1 200 #chr1 50 250 #then no_cols is 3 #5th argument: bin size in bps tile_size = as.numeric(args[5]) #e.g. #1000 or 4000 #6th, 7th, 8th, ... arguments: chromosome(s) to consider chr_to_consider = args[6:length(args)] #e.g. chr1 chr2 chr3 ... (can be of arbitrary length, but it is probably best due the number of chromosomes just to focus on autosomes) library("GenomicRanges") library(genome_annotation,character.only=TRUE) chr_length = seqlengths(get(genome_annotation)) chr_length = chr_length[which(names(chr_length) %in% chr_to_consider)] detach(paste0("package:",genome_annotation), unload=TRUE,character.only=TRUE) #chromosome lengths without overhang at the end chr_length = tile_size*floor(chr_length/tile_size) #file which is used to calculate the enrichments per bin #first column: chromosome names #second column: start bp of bin #third column: end bp of bin enrichments_per_bin <- array(0,dim=c(sum(chr_length)/tile_size,3)) counter = 0 for(i in 1:length(chr_length)){ enrichments_per_bin[(counter+1):(counter+(chr_length[i]/tile_size)),1] = names(chr_length)[i] enrichments_per_bin[(counter+1):(counter+(chr_length[i]/tile_size)),2] = tile_size*(0:((chr_length[i]/tile_size)-1))+1 enrichments_per_bin[(counter+1):(counter+(chr_length[i]/tile_size)),3] = tile_size*(1:(chr_length[i]/tile_size)) counter = counter+(chr_length[i]/tile_size) } #create dataset array where the enrichments for each bin a stored dataset = array(0,dim = nrow(enrichments_per_bin)) #name each bin in dataset sensibly names(dataset) = paste(enrichments_per_bin[,1],enrichments_per_bin[,2],enrichments_per_bin[,3],sep="_") #convert enrichments_per_bin into a GRanges object enrichments_per_bin = GRanges(seqnames = Rle(enrichments_per_bin[,1]), ranges = IRanges(as.numeric(enrichments_per_bin[,2]), end = as.numeric(enrichments_per_bin[,3])), strand = Rle(strand(rep("*",nrow(enrichments_per_bin))))) #loop through the input file and consider for each line overlaps with the individual bins #then add to each enrichment of such a bin the enrichment score of the line times number of base pairs of overlap (which is done in the variable dataset) infile <- file(read_file, open="r") chunk_size = 2*10^6 repeat{ X <-read.delim(infile, header = FALSE, nrows=chunk_size) if(no_cols==3){ gr = GRanges(seqnames = Rle(X[,1]), ranges = IRanges(X[,2], end = X[,3]-1), strand = Rle(strand(rep("*",nrow(X)))),score=array(1,dim=nrow(X))) }else{ gr = GRanges(seqnames = Rle(X[,1]), ranges = IRanges(X[,2], end = X[,3]-1), strand = Rle(strand(rep("*",nrow(X)))),score=X[,4]) } match <- findOverlaps(enrichments_per_bin,gr) gr.over <- pintersect(enrichments_per_bin[queryHits(match)],gr[subjectHits(match)]) score_to_add = width(gr.over) score_to_add = score_to_add* gr$score[subjectHits(match)] score_to_add = by(score_to_add,queryHits(match),sum) dataset[as.numeric(names(score_to_add))] = dataset[as.numeric(names(score_to_add))]+score_to_add if (nrow(X) < chunk_size) break } #save dataset in RData file save(dataset,file=store_file)