# Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# This program is only intended to work if the bedGraph looks as follows
# (which should be the case at least for (most of) the ENCODE data):
# every line looks like
#     chromosome start end value
# so for instance
#     chr1 1 1000 4

import sys
import itertools
import numpy

# Path where the annotation csv file (the output from GTF_processing.py) can
# be found, e.g. "/home/user/Transcripts-v.18.csv".
csv_path = sys.argv[1]
gtffile = open(csv_path)

# Path where the file that has to be processed can be found,
# e.g. "/home/user/example.BedGraph".
processing_file = sys.argv[2]

# Transcript type ('lincRNA' or 'protein_coding' etc.).
# NOTE(review): the historical name shadows the builtin `type`; it is kept
# because the rest of the script reads this module-level name.
type = sys.argv[3]

# halfwinwidth is the range, e.g. +/- halfwinwidth around the TSS/TTS...;
# suggested value 2000.
halfwinwidth = int(sys.argv[4])

# granulationwin is the vector of numbers of windows the area around the
# TSS/TTS or the entire transcript is split into; suggested 1 and 40,
# input format: "1,40".
granulationwin_sys = sys.argv[5]
granulationwin = numpy.asarray([int(win_here) for win_here in granulationwin_sys.split(",")])
# Cumulative window counts; the 0-prefixed variant gives the start offset of
# each granulation's slice inside a flat per-item vector of length sum_granu.
granulationwin_cum_sum = numpy.cumsum(granulationwin)
granulationwin_cum_sum_extended = numpy.append(0, granulationwin_cum_sum)
sum_granu = sum(granulationwin)

# Name under which the profile and region count data should be stored
# (corresponds to input no. 8 in the manual).
store_names = sys.argv[8]
# Output prefix (folder given as sys.argv[6], e.g. "/home/user/") for the
# per-gene region count matrices written below.
sysimport_csv = sys.argv[6] + store_names
# Output prefix (folder given as sys.argv[7], e.g. "/home/user/") for the
# summed profile matrix written below.
sysimport_profile = sys.argv[7] + store_names

# Optional: store in a file whether or not the file format has been ok for our
# purposes - works at least for BigWig ENCODE files that have been transformed
# to BedGraph via the bigWigToBedGraph UCSC tool.
# Input example: "/home/user/correct_type.txt".
if len(sys.argv) >= 10:
    sysimport_correct_type = sys.argv[9]

pos = list()
poschr = list()

# The function defined below outputs a list with two components which are
# lists themselves:
# one (poschr) has as entries all chromosomes, so something like 'chr1',
# 'chr2' etc.; the other one (pos) has as entries the TSS and TTS for the
# considered transcript plus strand information and gene ID, so something like
# [1000,2000,'+','ENSGXXXXXXXX.X'], [20000,3000,'-','ENSGXXXXXXXXXX.X'] etc.
# We consider the outermost TSS/TTS over all transcripts for one gene ID
# (which of course depends on the strand info, so for '+' we take the minimum
# of all TSS over all transcripts for one gene ID, and the maximum for '-').
# This way the profiles of histone modifications etc. get as clear as
# possible, since the overlapping profiles of, say, several TSSs for one gene
# then do not affect each other so much.
# 'transcript_type' may have values 'lincRNA' or 'protein_coding' etc.;
# gtffile is the open annotation csv file from above.
def transcriptprocessing(transcript_type, gtffile):
    """Collect per-gene outermost [start, end, strand, gene ID] records.

    Returns [poschr, pos]: poschr is the list of chromosome names seen, and
    pos[i] is the list of records for chromosome poschr[i].  Rows of the
    annotation csv are assumed to be grouped by gene ID (column 0).
    """
    pos = list()
    poschr = list()
    currentgeneid = '0'
    currentstartval = -1
    currentendval = -1
    currentchr = '0'
    strand = '0'
    currenttype = '0'
    for line in gtffile:
        z = line.split(",")
        if z[0] == currentgeneid:
            # Same gene as the previous row: widen the outermost coordinates.
            if z[2] == transcript_type:
                if strand == "+":
                    currentstartval = min(currentstartval, int(z[4]), int(z[5]))
                    currentendval = max(currentendval, int(z[4]), int(z[5]))
                if strand == "-":
                    currentstartval = max(currentstartval, int(z[4]), int(z[5]))
                    currentendval = min(currentendval, int(z[4]), int(z[5]))
        else:
            # A new gene starts: first flush the finished gene's record...
            if currenttype == transcript_type:
                if currentgeneid != '0':
                    if currentchr in poschr:
                        pos[poschr.index(currentchr)].append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
                    else:
                        newlist = list()
                        newlist.append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
                        pos.append(newlist)
                        poschr.append(currentchr)
            # ...then start tracking the new one.
            currentchr = z[3]
            currenttype = z[2]
            currentgeneid = z[0]
            # BUGFIX: the original compared z[6] against the literals "+\r\n"
            # and "-\r\n", which only matches CRLF files read without newline
            # translation (Python 2 binary-ish reads); for LF files, or under
            # Python 3 universal newlines, the strand was never set and stale
            # coordinates from the previous gene were reused.  Stripping the
            # field keeps the CRLF case working and also accepts "+\n", "-\n"
            # and bare "+"/"-".
            strand_token = z[6].strip()
            if strand_token == "+":
                strand = '+'
                currentstartval = min(int(z[4]), int(z[5]))
                currentendval = max(int(z[4]), int(z[5]))
            if strand_token == "-":
                strand = '-'
                currentstartval = max(int(z[4]), int(z[5]))
                currentendval = min(int(z[4]), int(z[5]))
    # Flush the very last gene of the file.
    if currenttype == transcript_type:
        if currentchr in poschr:
            pos[poschr.index(currentchr)].append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
        else:
            newlist = list()
            newlist.append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
            pos.append(newlist)
            poschr.append(currentchr)
    return [poschr, pos]


# Establishing poschr and pos.
[poschr, pos] = transcriptprocessing(type, gtffile)

# Check that all chromosomes (including chrM) are present in poschr and pos.
# If not we add the chromosome with a "fake" gene, which will not be stored.
chr_not_to_be_stored = list()
control_poschr_list = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                       'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13',
                       'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
                       'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
for chrom in control_poschr_list:
    if chrom not in poschr:
        poschr.append(chrom)
        chr_not_to_be_stored.append(chrom)
        pos.append([[1, 10, '+', 'to_be_deleted']])

# Split every gene record up into TSS / Transcript / TTS regions of the form
# [region start, region end, label, strand, gene ID].
for i in range(len(poschr)):
    newlist = list()
    for z in pos[i]:
        if z[2] == '+':
            newlist.append([int(z[0]) - halfwinwidth, int(z[0]) + halfwinwidth, 'TSS', z[2], z[3]])
            newlist.append([int(z[0]), int(z[1]) + 1, 'Transcript', z[2], z[3]])
            newlist.append([int(z[1]) - halfwinwidth, int(z[1]) + halfwinwidth, 'TTS', z[2], z[3]])
        if z[2] == '-':
            newlist.append([int(z[0]) - halfwinwidth + 1, int(z[0]) + halfwinwidth + 1, 'TTS', z[2], z[3]])
            newlist.append([int(z[0]), int(z[1]) + 1, 'Transcript', z[2], z[3]])
            newlist.append([int(z[1]) - halfwinwidth + 1, int(z[1]) + halfwinwidth + 1, 'TSS', z[2], z[3]])
    pos[i] = newlist

# We sort the TSS/TTS/gene bodies with regard to the position on the chromosome.
from operator import itemgetter
for i in range(len(poschr)):
    pos[i] = sorted(pos[i], key=itemgetter(0))


def granulationpointer(x, n):
    """Build a coarse position index per chromosome.

    For each chromosome i, out[i][k] is the lowest index j such that the item
    x[i][j] (sorted by start position) starts at a chromosome position
    >= k*n.  Used below to restrict overlap searches to a small slice.
    """
    out = list()
    for i in range(len(x)):
        outi = list()
        z = 0
        for j in range(len(x[i])):
            # Every threshold z = k*n not yet exceeded is first reached by j.
            while z <= int(x[i][j][0]):
                outi.append(j)
                z += n
        out.append(numpy.asarray(outi))
    return out


def splitting(length, times, orientation):
    """Return times+1 boundaries splitting [0, length] into (nearly) equal windows.

    The vector starts at 0 and ends at length.  For orientation '-' the
    boundary vector is mirrored (length - x, reversed), so the unevenly sized
    remainder window switches ends.
    """
    x = numpy.arange(0, length, int(numpy.ceil(float(length) / float(times))))
    while len(x) < (times + 1):
        x = numpy.append(x, length)
    if orientation == '-':
        x = numpy.array((times + 1) * [length]) - x
        x = x[::-1]
    return x


# Itemvecs holds, for each chromosome and each TSS/TTS/Transcript item, a flat
# float32 vector of length sum_granu into which enrichment sums are
# accumulated (one slice of length granulationwin[k] per granulation k).
# splitting_vec holds the matching window-boundary indices for each item
# (granulationwin[k]+1 boundaries per granulation k, concatenated).
Itemvecs = list()
splitting_vec = list()
for i in range(len(poschr)):
    itemveci = list()
    splitting_vec_i = list()
    for j in range(len(pos[i])):
        itemvecj = numpy.zeros(sum_granu, dtype='float32')
        splitting_vec_j = numpy.array([], dtype='int32')
        for k in range(len(granulationwin)):
            # pos[i][j][3] is the strand; splitting() mirrors the boundary
            # vector for '-' strand items.
            splitting_vec_j = numpy.append(splitting_vec_j, splitting(pos[i][j][1] - pos[i][j][0], granulationwin[k], pos[i][j][3]))
        splitting_vec_i.append(numpy.asarray(splitting_vec_j))
        itemveci.append(itemvecj)
    Itemvecs.append(numpy.asarray(itemveci))
    splitting_vec.append(numpy.asarray(splitting_vec_i))

# Connect to the data file.
a = open(processing_file)
# Bucket width (in bases) for the coarse index built by granulationpointer.
npointer = 3 * (10 ** 6)
gpointer = granulationpointer(pos, npointer)
lengthgpointer = numpy.asarray([len(gpointer[i]) for i in range(len(poschr))])
# The bedGraph is consumed in chunks of at most breaklength collected values.
breaklength = 100000
# State carried across chunks of the processing loop below.
# NOTE(review): lastchr/lastpos appear unused later; kept for compatibility.
lastchr = '0'
lastpos = -1
oldperformchr = '0'
itemposchrindex = 0
lengpointer = 0
itemposperform = 0
gpointervec = 0
currentitemvecpos = 0
current_splitting_vec = 0
# What comes now is in essence:
# go through the processing file; collect several (at most breaklength) values
# in a certain region; use gpointer resp. gpointervec to figure out the
# potential TSS/TTS/... that may intersect with these enrichment calls; and if
# there is an overlap, map them to the respective vector in Itemvecs.
n = 0
scorevec_start = -1
scorevec_end = -1
scorevec_chr = '0'
scorevec_chr_proc = 0
scorevec_proc = 0
scorevec = list()
scorevec_proc = numpy.asarray([])
# Bookkeeping of every distinct first-column token of the processing file,
# used by the optional format check at the bottom of the script.
processing_file_type = list()
processing_file_frequency = list()
processing_file_firstocurrence = list()
for line in a:
    n += 1
    z_line = line.split()
    if z_line[0] in processing_file_type:
        processing_file_frequency[processing_file_type.index(z_line[0])] += 1
    else:
        processing_file_type.append(z_line[0])
        processing_file_firstocurrence.append(n)
        processing_file_frequency.append(1)
    # Only data lines whose first column contains "chr" are processed.
    if 'chr' not in z_line[0]:
        continue
    region = z_line[0]
    position_start = int(z_line[1])
    position_end = int(z_line[2])
    value = float(z_line[3])
    if scorevec_chr != region:
        # Chromosome changed: hand the collected values over for mapping and
        # start a fresh per-base score vector with the current call.
        scorevec_chr_proc = scorevec_chr
        scorevec_chr = region
        scorevec_proc = numpy.asarray(scorevec)
        scorevec = list()
        if (position_start + 1) == position_end:
            scorevec.append(value)
        else:
            scorevec.extend((position_end - position_start) * [value])
        # posvec holds the chromosome positions of scorevec_proc.
        posvec = numpy.arange(scorevec_start, scorevec_end)
        scorevec_start = position_start
        scorevec_end = position_end
    else:
        if position_start >= scorevec_end:
            # Same chromosome, call at or behind the current end: pad the gap
            # with zeros, then append this call's per-base values.
            scorevec.extend(numpy.zeros(position_start - scorevec_end, dtype='float32'))
            if (position_start + 1) == position_end:
                scorevec.append(value)
            else:
                scorevec.extend((position_end - position_start) * [value])
            scorevec_end = position_end
        else:
            # Positions went backwards: flush and restart, as for a new
            # chromosome.
            scorevec_chr_proc = scorevec_chr
            scorevec_chr = region
            scorevec_proc = numpy.asarray(scorevec)
            scorevec = list()
            if (position_start + 1) == position_end:
                scorevec.append(value)
            else:
                scorevec.extend((position_end - position_start) * [value])
            posvec = numpy.arange(scorevec_start, scorevec_end)
            scorevec_start = position_start
            scorevec_end = position_end
    if len(scorevec) >= breaklength:
        # The chunk is full: flush it and continue collecting right behind it.
        scorevec_chr_proc = scorevec_chr
        scorevec_chr = scorevec_chr
        scorevec_proc = numpy.asarray(scorevec)
        scorevec = list()
        posvec = numpy.arange(scorevec_start, scorevec_end)
        scorevec_start = scorevec_end
        scorevec_end = scorevec_end
    if len(scorevec_proc) > 0:
        maxval = posvec[-1] + halfwinwidth + 1
        minval = max(0, posvec[0] - halfwinwidth - 1)
        # BUGFIX: use floor division so nmax/nmin stay integers and remain
        # valid as indices under Python 3 (true division would yield floats);
        # on the integer operands here this is exactly what '/' computed
        # under Python 2.
        nmax = (maxval + ((npointer - maxval) % npointer)) // npointer
        nmin = (minval - (minval % npointer)) // npointer
        if oldperformchr != scorevec_chr_proc:
            # Chromosome of the flushed chunk changed: reload per-chromosome
            # lookup structures.
            itemposchrindex = poschr.index(scorevec_chr_proc)
            lengpointer = lengthgpointer[itemposchrindex]
            oldperformchr = scorevec_chr_proc
            itemposperform = pos[itemposchrindex]
            gpointervec = gpointer[itemposchrindex]
            currentitemvecpos = pos[itemposchrindex]
            current_splitting_vec = splitting_vec[itemposchrindex]
        # Narrow the candidate items to those whose coarse buckets the chunk
        # can touch; start is the absolute index of the first candidate.
        if lengpointer == 1:
            possibleitemvecpos = currentitemvecpos
            possible_splitting_vec = current_splitting_vec
            start = 0
        else:
            if nmax >= lengpointer:
                if nmin >= lengpointer:
                    possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))]
                    possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))]
                else:
                    possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))]
                    possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))]
            else:
                possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax] + 1)]
                possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax] + 1)]
            if nmin >= lengpointer:
                start = gpointervec[-1]
            else:
                start = gpointervec[nmin]
        minval1 = posvec[0]
        maxval1 = posvec[-1]
        for position in range(len(possibleitemvecpos)):
            if possibleitemvecpos[position][1] >= minval1 and possibleitemvecpos[position][0] <= maxval1:
                # Copy the overlapping part of the chunk into a vector
                # covering the item region, then reduce it window-wise via a
                # cumulative sum.
                scorevecitem = numpy.zeros(possibleitemvecpos[position][1] - possibleitemvecpos[position][0])
                start_scorevecitem = max(0, minval1 - possibleitemvecpos[position][0])
                start_scorevec_proc = max(0, possibleitemvecpos[position][0] - posvec[0])
                end_scorevec_proc = min(possibleitemvecpos[position][1] - 1 - posvec[0], posvec[-1] - posvec[0]) + 1
                end_scorevecitem = end_scorevec_proc - start_scorevec_proc + start_scorevecitem
                scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc]
                scorevecitem_cum_sum = numpy.append(0, numpy.cumsum(scorevecitem))
                scorevecitem_add = numpy.zeros(sum_granu)
                for granulationwin_index in range(len(granulationwin)):
                    for reductionindex in range(granulationwin[granulationwin_index]):
                        # Window sum = difference of cumulative sums at the
                        # window boundaries.  The extra "+granulationwin_index"
                        # skips the one additional boundary that each earlier
                        # granulation contributes to possible_splitting_vec.
                        scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1]] - scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]]
                Itemvecs[itemposchrindex][start] += scorevecitem_add
            # start tracks the absolute index of possibleitemvecpos[position]
            # in this chromosome's item list, so it advances every iteration.
            start += 1
        scorevec_proc = numpy.array([])

# Since the very last chunk collected by the loop has not been mapped yet, we
# essentially run the mapping part of the loop body once more on the remainder.
scorevec_chr_proc = scorevec_chr
scorevec_proc = numpy.asarray(scorevec)
posvec = numpy.arange(scorevec_start, scorevec_end)
if len(scorevec_proc) > 0:
    maxval = posvec[-1] + halfwinwidth + 1
    minval = max(0, posvec[0] - halfwinwidth - 1)
    # BUGFIX: floor division, as above.
    nmax = (maxval + ((npointer - maxval) % npointer)) // npointer
    nmin = (minval - (minval % npointer)) // npointer
    if oldperformchr != scorevec_chr_proc:
        itemposchrindex = poschr.index(scorevec_chr_proc)
        lengpointer = lengthgpointer[itemposchrindex]
        oldperformchr = scorevec_chr_proc
        itemposperform = pos[itemposchrindex]
        gpointervec = gpointer[itemposchrindex]
        currentitemvecpos = pos[itemposchrindex]
        current_splitting_vec = splitting_vec[itemposchrindex]
    if lengpointer == 1:
        possibleitemvecpos = currentitemvecpos
        possible_splitting_vec = current_splitting_vec
        start = 0
    else:
        if nmax >= lengpointer:
            if nmin >= lengpointer:
                possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))]
                possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))]
            else:
                possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))]
                possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))]
        else:
            possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax] + 1)]
            possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax] + 1)]
        if nmin >= lengpointer:
            start = gpointervec[-1]
        else:
            start = gpointervec[nmin]
    minval1 = posvec[0]
    maxval1 = posvec[-1]
    for position in range(len(possibleitemvecpos)):
        if possibleitemvecpos[position][1] >= minval1 and possibleitemvecpos[position][0] <= maxval1:
            scorevecitem = numpy.zeros(possibleitemvecpos[position][1] - possibleitemvecpos[position][0])
            start_scorevecitem = max(0, minval1 - possibleitemvecpos[position][0])
            start_scorevec_proc = max(0, possibleitemvecpos[position][0] - posvec[0])
            end_scorevec_proc = min(possibleitemvecpos[position][1] - 1 - posvec[0], posvec[-1] - posvec[0]) + 1
            end_scorevecitem = end_scorevec_proc - start_scorevec_proc + start_scorevecitem
            scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc]
            scorevecitem_add = numpy.zeros(sum_granu)
            scorevecitem_cum_sum = numpy.append(0, numpy.cumsum(scorevecitem))
            for granulationwin_index in range(len(granulationwin)):
                for reductionindex in range(granulationwin[granulationwin_index]):
                    scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1]] - scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]]
            Itemvecs[itemposchrindex][start] += scorevecitem_add
        start += 1
    scorevec_proc = numpy.array([])

# Normalise every accumulated window sum by its window length (in bases).
for i in range(len(poschr)):
    for j in range(len(pos[i])):
        for granulationwin_index in range(len(granulationwin)):
            for reductionindex in range(granulationwin[granulationwin_index]):
                scaling_factor = splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1] - splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]
                if scaling_factor != 0:
                    Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = (Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index] + reductionindex]) / scaling_factor

# Write the enrichments into csv files: one per-gene matrix for each region
# type (TSS/TTS/Transcript) and granulation, plus the summed profiles for all
# region types and granulations in a separate csv file.
import csv
csv_open_profile = open(sysimport_profile + '.csv', 'w')
csv_writer_profile = csv.writer(csv_open_profile)
csv_list_profile = list()
for k in range(len(granulationwin)):
    csv_open_TSS = open(sysimport_csv + '_TSS_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_TSS = csv.writer(csv_open_TSS)
    csv_list_TSS = list()
    csv_open_TTS = open(sysimport_csv + '_TTS_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_TTS = csv.writer(csv_open_TTS)
    csv_list_TTS = list()
    csv_open_Transcript = open(sysimport_csv + '_Transcript_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_Transcript = csv.writer(csv_open_Transcript)
    csv_list_Transcript = list()
    profile_TSS = numpy.zeros(granulationwin[k])
    profile_TTS = numpy.zeros(granulationwin[k])
    profile_Transcript = numpy.zeros(granulationwin[k])
    headline = list()
    headline.append("Gene_ID")
    for i in range(granulationwin[k]):
        headline.append('Interval ' + str(i + 1))
    csv_list_TSS.append(headline)
    csv_list_TTS.append(headline)
    csv_list_Transcript.append(headline)
    for i in range(len(poschr)):
        # Skip chromosomes that only exist because of the "fake" filler gene.
        if poschr[i] not in chr_not_to_be_stored:
            for j in range(len(pos[i])):
                newline = list()
                newline.append(pos[i][j][4])
                extend_line = Itemvecs[i][j][granulationwin_cum_sum_extended[k]:granulationwin_cum_sum_extended[k + 1]]
                # Reverse the window order for '-'-strand items.
                if (len(extend_line) > 1) and pos[i][j][3] == '-':
                    extend_line = extend_line[::-1]
                newline.extend(extend_line)
                if pos[i][j][2] == 'TSS':
                    csv_list_TSS.append(newline)
                    profile_TSS += extend_line
                if pos[i][j][2] == 'TTS':
                    csv_list_TTS.append(newline)
                    profile_TTS += extend_line
                if pos[i][j][2] == 'Transcript':
                    csv_list_Transcript.append(newline)
                    profile_Transcript += extend_line
    csv_writer_TSS.writerows(csv_list_TSS)
    csv_writer_TTS.writerows(csv_list_TTS)
    csv_writer_Transcript.writerows(csv_list_Transcript)
    csv_open_TSS.close()
    csv_open_TTS.close()
    csv_open_Transcript.close()
    TSS_profile = list()
    TSS_profile.append('TSS')
    TSS_profile.append(granulationwin[k])
    TSS_profile.extend(profile_TSS)
    csv_list_profile.append(TSS_profile)
    TTS_profile = list()
    TTS_profile.append('TTS')
    TTS_profile.append(granulationwin[k])
    TTS_profile.extend(profile_TTS)
    csv_list_profile.append(TTS_profile)
    Transcript_profile = list()
    Transcript_profile.append('Transcript')
    Transcript_profile.append(granulationwin[k])
    Transcript_profile.extend(profile_Transcript)
    csv_list_profile.append(Transcript_profile)
csv_writer_profile.writerows(csv_list_profile)
csv_open_profile.close()

# Optional: check whether the processing file is of such a kind that our
# processing makes sense, and append the result to sysimport_correct_type.
if len(sys.argv) >= 10:
    valid = True
    for i in range(len(processing_file_type)):
        if 'chr' not in processing_file_type[i]:
            valid = False
    f = open(sysimport_correct_type, "a")
    if valid:
        f.write('Is ok for ' + processing_file + '\n')
    else:
        f.write('Not ok for ' + processing_file + '\n')
    f.close()