#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see import sys import itertools import numpy import wiggelen #path where the annotation csv file (the output from GTF_processing.py) can be found # e.g. "/home/user/Transcripts-v.18.csv" csv_path = sys.argv[1] gtffile = open(csv_path) #path where the file, that has to be processed, can be found # e.g. "/home/user/example.wig" processing_file = sys.argv[2] #type ('lincRNA' or 'protein_coding' etc.) type = sys.argv[3] #halfwinwidth is the range, e.g., +/- halfwinwidth around the TSS/TTS..., suggested value 2000 halfwinwidth = int(sys.argv[4]) #granulationwin is the vector for numbers of windows the area around the TSS/TTS or the entire transcript is split into..., suggested 1 and 40, Input: "1,40" granulationwin_sys = sys.argv[5] granulationwin_sys = granulationwin_sys.split(",") granulationwin = list() for win_here in granulationwin_sys: granulationwin.append(int(win_here)) granulationwin = numpy.asarray(granulationwin) granulationwin_cum_sum = numpy.cumsum(granulationwin) granulationwin_cum_sum_extended = numpy.append(0,granulationwin_cum_sum) sum_granu = sum(granulationwin) #name under which the profile and region count data should be stored (corresponds to input no. 8 in the manual) store_names = sys.argv[8] #Folder to store profile matrix, as sys.argv input you may take, e.g., "/home/user/" sysimport_csv = sys.argv[6]+store_names #Folder to store CpG count matrices, as sys.argv input you may take, e.g., "/home/user/" sysimport_profile = sys.argv[7]+store_names pos = list() poschr = list() # The following function outputs a list with two components which are lists themselves: #one (poschr) has as entries all chromosomes so something like 'chr1', 'chr2' etc. #the other one (pos) has as entries the TSS and TTS for the considered considered transcript plus strand information and gene ID # so something like [1000,2000,'+','ENSGXXXXXXXX.X'],[20000,3000,'-','ENSGXXXXXXXXXX.X'] etc. #we consider the outmost TSS/TTS for all transcripts for one gene ID (which of course depends on the strand info, #so for '+' for take the minimum of all TSS for all transcripts for one gene ID and max for '-') and if False one takes all transcripts individually #this is because then the profiles of histone modifications etc. get as clear as possible since the overlapping profiles of different, say, TSSs for one gene don't affect each other so much then #'transcript_type' may as values 'lncRNA' or 'protein_coding' etc. #gtffile is clear from above def transcriptprocessing(transcript_type,gtffile): pos = list() poschr = list() currentgeneid = '0' currentstartval = -1 currentendval = -1 currentchr = '0' strand = '0' currenttype = '0' for line in gtffile: z = line.split(",") if z[0]==currentgeneid: if z[2]==transcript_type: if strand == "+": currentstartval = min(currentstartval,int(z[4]),int(z[5])) currentendval = max(currentendval,int(z[4]),int(z[5])) if strand == "-": currentstartval = max(currentstartval,int(z[4]),int(z[5])) currentendval = min(currentendval,int(z[4]),int(z[5])) else: if currenttype==transcript_type: if currentgeneid != '0': if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) currentchr = z[3] currenttype = z[2] currentgeneid = z[0] if z[6] == "+\r\n": strand = '+' currentstartval = min(int(z[4]),int(z[5])) currentendval = max(int(z[4]),int(z[5])) if z[6] == "-\r\n": strand = '-' currentstartval = max(int(z[4]),int(z[5])) currentendval = min(int(z[4]),int(z[5])) if currenttype==transcript_type: if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) return [poschr,pos] #establishing poschr and pos [poschr,pos] = transcriptprocessing(type,gtffile) #check if all chromosomes (including chrM) are in is pochr and pos #if not we will add everything including a "fake" gene, which will not be stored chr_not_to_be_stored = list() control_poschr_list = list(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']) for chr in control_poschr_list: if chr not in poschr: poschr.append(chr) chr_not_to_be_stored.append(chr) newlist = list() newlist.append([1,10,'+','to_be_deleted']) pos.append(newlist) #splitting the lines up into TSS/Transcript/TTS regions for i in range(len(poschr)): newlist = list() for z in pos[i]: if z[2]=='+': newlist.append([int(z[0])-halfwinwidth,int(z[0])+halfwinwidth,'TSS',z[2],z[3]]) newlist.append([int(z[0]),int(z[1])+1,'Transcript',z[2],z[3]]) newlist.append([int(z[1])-halfwinwidth,int(z[1])+halfwinwidth,'TTS',z[2],z[3]]) if z[2]=='-': newlist.append([int(z[0])-halfwinwidth+1,int(z[0])+halfwinwidth+1,'TTS',z[2],z[3]]) newlist.append([int(z[0]),int(z[1])+1,'Transcript',z[2],z[3]]) newlist.append([int(z[1])-halfwinwidth+1,int(z[1])+halfwinwidth+1,'TSS',z[2],z[3]]) pos[i]=newlist #we sort the TSS/TTS/Gene bodies with regards to the position on the chromosome from operator import itemgetter for i in range(len(poschr)): pos[i] = sorted(pos[i], key=itemgetter(0)) #this function should help us later #in essence it just says that to each chr i and k*n what is the lowest index j such that the minimum position of the item in the list pos[i] for chr i and with position j in this list #has a chromosome position >= k*n def granulationpointer(x,n): out = list() for i in range(len(x)): outi = list() z = 0 for j in range(len(x[i])): while z<=int(x[i][j][0]): outi.append(j) z += n out.append(numpy.asarray(outi)) return out #The following function generates a vector for each length and times input, such that it has a entries zero and equidistributed further entries which together split the vector #[0,1,...,length] into (nearly) equal windows def splitting(length,times,orientation): x = numpy.arange(0,length,int(numpy.ceil(float(length)/float(times)))) while len(x)<(times+1): x = numpy.append(x,length) if orientation == '-': x = numpy.array((times+1)*[length])-x x = x[::-1] return x #Itemvecs has in the end to each chr and each TSS/TTS/... a vector with granulationwin vectors of length granulationwin[k] to which we map potential enrichment value in the respective window #splitting_vec gives the granulation windows indices for each item of interest Itemvecs = list() splitting_vec = list() for i in range(len(poschr)): itemveci = list() splitting_vec_i = list() for j in range(len(pos[i])): itemvecj = numpy.zeros(sum_granu,dtype='float32') splitting_vec_j = numpy.array([],dtype='int32') for k in range(len(granulationwin)): splitting_vec_j = numpy.append(splitting_vec_j,splitting(pos[i][j][1]-pos[i][j][0],granulationwin[k],pos[i][j][3])) splitting_vec_i.append(numpy.asarray(splitting_vec_j)) itemveci.append(itemvecj) Itemvecs.append(numpy.asarray(itemveci)) splitting_vec.append(numpy.asarray(splitting_vec_i)) #connect to data file a = open(processing_file) npointer = 3*(10 ** 6) gpointer = granulationpointer(pos,npointer) lengthgpointer = list() for i in range(len(poschr)): lengthgpointer.append(len(gpointer[i])) lengthgpointer = numpy.asarray(lengthgpointer) breaklength = 100000 lastchr = '0' lastpos = -1 oldperformchr = '0' itemposchrindex = 0 lengpointer = 0 itemposperform = 0 gpointervec = 0 currentitemvecpos = 0 current_splitting_vec = 0 #what comes now is in essence: #go through the processing file #collect several (at most breaklength) values in a certain region #use gpointer resp. gpointervec to figure out the potential TSS/TTS/... that may intersect with these enrichment calls #if there is an overlap map them to the respective vector in Itemvecs #at first we try it fixedStep wig files (which is quicker), if there is also variableStep in it (var_step = True), we stop and process below with wiggelen n=0 step_mode = '0' scorevec_start = -1 scorevec_end = -1 scorevec_chr = '0' scorevec_chr_proc = 0 scorevec_proc = 0 scorevec = list() scorevec_proc = numpy.asarray([]) processing_file_type = list() processing_file_frequency = list() processing_file_firstocurrence = list() var_step = False for line in a: n +=1 linesplit = line.split() if linesplit[0] in processing_file_type: processing_file_frequency[processing_file_type.index(linesplit[0])] = processing_file_frequency[processing_file_type.index(linesplit[0])]+1 else: processing_file_type.append(linesplit[0]) processing_file_firstocurrence.append(n) processing_file_frequency.append(1) if linesplit[0] == 'fixedStep': region = linesplit[1].split('=')[1] position = int(linesplit[2].split('=')[1]) step = int(linesplit[3].split('=')[1]) span = int(linesplit[4].split('=')[1]) if scorevec_chr != region or position != scorevec_end: scorevec_chr_proc = scorevec_chr scorevec_chr = region scorevec_proc = numpy.asarray(scorevec) scorevec = list() posvec = numpy.arange(scorevec_start,scorevec_end) scorevec_start = position scorevec_end = position elif linesplit[0] == 'variableStep': var_step = True break elif n>1: scorevec.extend(span*[float(linesplit[0])]) if span != step: scorevec.extend((step-span)*[0]) scorevec_end += step if len(scorevec)>=breaklength: scorevec_chr_proc = scorevec_chr scorevec_chr = scorevec_chr scorevec_proc = numpy.asarray(scorevec) scorevec = list() posvec = numpy.arange(scorevec_start,scorevec_end) scorevec_start = scorevec_end scorevec_end = scorevec_end if len(scorevec_proc)>0: maxval = posvec[-1]+halfwinwidth+1 minval = max(0,posvec[0]-halfwinwidth-1) nmax = (maxval+((npointer-maxval)%npointer))/npointer nmin = (minval - (minval%npointer))/npointer if oldperformchr != scorevec_chr_proc: itemposchrindex = poschr.index(scorevec_chr_proc) lengpointer = lengthgpointer[itemposchrindex] oldperformchr = scorevec_chr_proc itemposperform = pos[itemposchrindex] gpointervec = gpointer[itemposchrindex] currentitemvecpos = pos[itemposchrindex] current_splitting_vec = splitting_vec[itemposchrindex] if lengpointer==1: possibleitemvecpos = currentitemvecpos possible_splitting_vec = current_splitting_vec start=0 else: if nmax >= lengpointer: if nmin >= lengpointer: possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax]+1)] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax]+1)] if nmin >= lengpointer: start = gpointervec[-1] else: start = gpointervec[nmin] minval1 = posvec[0] maxval1 = posvec[-1] for position in range(len(possibleitemvecpos)): if possibleitemvecpos[position][1]>=minval1 and possibleitemvecpos[position][0]<=maxval1: scorevecitem = numpy.zeros(possibleitemvecpos[position][1]-possibleitemvecpos[position][0]) start_scorevecitem = max(0,minval1-possibleitemvecpos[position][0]) start_scorevec_proc = max(0,possibleitemvecpos[position][0]-posvec[0]) end_scorevec_proc = min(possibleitemvecpos[position][1]-1-posvec[0],posvec[-1]-posvec[0])+1 end_scorevecitem = end_scorevec_proc-start_scorevec_proc+start_scorevecitem scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc] scorevecitem_cum_sum = numpy.append(0,numpy.cumsum(scorevecitem)) scorevecitem_add = numpy.zeros(sum_granu) for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]-scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]] #scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex]= sum(scorevecitem[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]:possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]) Itemvecs[itemposchrindex][start] += scorevecitem_add start += 1 scorevec_proc = numpy.array([]) #since the very last calls of the for loop may not be used in the mapping process we essentially use the main part of the for loop again scorevec_chr_proc = scorevec_chr scorevec_proc = numpy.asarray(scorevec) posvec = numpy.arange(scorevec_start,scorevec_end) if len(scorevec_proc)>0: maxval = posvec[-1]+halfwinwidth+1 minval = max(0,posvec[0]-halfwinwidth-1) nmax = (maxval+((npointer-maxval)%npointer))/npointer nmin = (minval - (minval%npointer))/npointer if oldperformchr != scorevec_chr_proc: itemposchrindex = poschr.index(scorevec_chr_proc) lengpointer = lengthgpointer[itemposchrindex] oldperformchr = scorevec_chr_proc itemposperform = pos[itemposchrindex] gpointervec = gpointer[itemposchrindex] currentitemvecpos = pos[itemposchrindex] current_splitting_vec = splitting_vec[itemposchrindex] if lengpointer==1: possibleitemvecpos = currentitemvecpos possible_splitting_vec = current_splitting_vec start=0 else: if nmax >= lengpointer: if nmin >= lengpointer: possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax]+1)] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax]+1)] if nmin >= lengpointer: start = gpointervec[-1] else: start = gpointervec[nmin] minval1 = posvec[0] maxval1 = posvec[-1] for position in range(len(possibleitemvecpos)): if possibleitemvecpos[position][1]>=minval1 and possibleitemvecpos[position][0]<=maxval1: scorevecitem = numpy.zeros(possibleitemvecpos[position][1]-possibleitemvecpos[position][0]) start_scorevecitem = max(0,minval1-possibleitemvecpos[position][0]) start_scorevec_proc = max(0,possibleitemvecpos[position][0]-posvec[0]) end_scorevec_proc = min(possibleitemvecpos[position][1]-1-posvec[0],posvec[-1]-posvec[0])+1 end_scorevecitem = end_scorevec_proc-start_scorevec_proc+start_scorevecitem scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc] scorevecitem_add = numpy.zeros(sum_granu) scorevecitem_cum_sum = numpy.append(0,numpy.cumsum(scorevecitem)) for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]-scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]] Itemvecs[itemposchrindex][start] += scorevecitem_add start += 1 scorevec_proc = numpy.array([]) #variableStep in it (var_step = True), we process below with wiggelen if var_step == True: #Itemvecs has in the end to each chr and each TSS/TTS/... a vector with granulationwin vectors of length granulationwin[k] to which we map potential enrichment value in the respective window #splitting_vec gives the granultion windows indices for each item of interest Itemvecs = list() splitting_vec = list() for i in range(len(poschr)): itemveci = list() splitting_vec_i = list() for j in range(len(pos[i])): itemvecj = numpy.zeros(sum_granu,dtype='float32') splitting_vec_j = numpy.array([],dtype='int32') for k in range(len(granulationwin)): splitting_vec_j = numpy.append(splitting_vec_j,splitting(pos[i][j][1]-pos[i][j][0],granulationwin[k],pos[i][j][3])) splitting_vec_i.append(numpy.asarray(splitting_vec_j)) itemveci.append(itemvecj) Itemvecs.append(numpy.asarray(itemveci)) splitting_vec.append(numpy.asarray(splitting_vec_i)) #connect to data file with the wiggelen package a = wiggelen.walk(open(processing_file)) npointer = 3*(10 ** 6) gpointer = granulationpointer(pos,npointer) lengthgpointer = list() for i in range(len(poschr)): lengthgpointer.append(len(gpointer[i])) lengthgpointer = numpy.asarray(lengthgpointer) breaklength = 100000 lastchr = '0' lastpos = -1 oldperformchr = '0' itemposchrindex = 0 lengpointer = 0 itemposperform = 0 gpointervec = 0 currentitemvecpos = 0 current_splitting_vec = 0 #what comes now is in essence: #go through wig file #collect several (at most breaklength) values in a certain region #use gpointer resp. gpointervec to figure out the potential TSS/TTS/... that may intersect with these enrichment calls #if there is an overlap map them to the respective vector in Itemvecs n=0 scorevec_start = -1 scorevec_end = -1 scorevec_chr = '0' scorevec_chr_proc = 0 scorevec_proc = 0 scorevec = list() scorevec_proc = numpy.asarray([]) for region, position, value in a: if scorevec_chr != region: scorevec_chr_proc = scorevec_chr scorevec_chr = region scorevec_proc = numpy.asarray(scorevec) scorevec = list() scorevec.append(value) posvec = numpy.arange(scorevec_start,scorevec_end) scorevec_start = position scorevec_end = position+1 else: scorevec.extend(numpy.zeros(position-scorevec_end,dtype='float32')) scorevec.append(value) scorevec_end = position+1 if len(scorevec)>=breaklength: scorevec_chr_proc = scorevec_chr scorevec_chr = scorevec_chr scorevec_proc = numpy.asarray(scorevec) scorevec = list() posvec = numpy.arange(scorevec_start,scorevec_end) scorevec_start = scorevec_end scorevec_end = scorevec_end if len(scorevec_proc)>0: maxval = posvec[-1]+halfwinwidth+1 minval = max(0,posvec[0]-halfwinwidth-1) nmax = (maxval+((npointer-maxval)%npointer))/npointer nmin = (minval - (minval%npointer))/npointer if oldperformchr != scorevec_chr_proc: itemposchrindex = poschr.index(scorevec_chr_proc) lengpointer = lengthgpointer[itemposchrindex] oldperformchr = scorevec_chr_proc itemposperform = pos[itemposchrindex] gpointervec = gpointer[itemposchrindex] currentitemvecpos = pos[itemposchrindex] current_splitting_vec = splitting_vec[itemposchrindex] if lengpointer==1: possibleitemvecpos = currentitemvecpos possible_splitting_vec = current_splitting_vec start=0 else: if nmax >= lengpointer: if nmin >= lengpointer: possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax]+1)] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax]+1)] if nmin >= lengpointer: start = gpointervec[-1] else: start = gpointervec[nmin] minval1 = posvec[0] maxval1 = posvec[-1] for position in range(len(possibleitemvecpos)): if possibleitemvecpos[position][1]>=minval1 and possibleitemvecpos[position][0]<=maxval1: scorevecitem = numpy.zeros(possibleitemvecpos[position][1]-possibleitemvecpos[position][0]) start_scorevecitem = max(0,minval1-possibleitemvecpos[position][0]) start_scorevec_proc = max(0,possibleitemvecpos[position][0]-posvec[0]) end_scorevec_proc = min(possibleitemvecpos[position][1]-1-posvec[0],posvec[-1]-posvec[0])+1 end_scorevecitem = end_scorevec_proc-start_scorevec_proc+start_scorevecitem scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc] scorevecitem_cum_sum = numpy.append(0,numpy.cumsum(scorevecitem)) scorevecitem_add = numpy.zeros(sum_granu) for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]-scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]] #scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex]= sum(scorevecitem[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]:possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]) Itemvecs[itemposchrindex][start] += scorevecitem_add start += 1 scorevec_proc = numpy.array([]) #since the very last calls of the for loop may not be used in the mapping process we essentially use the main part of the for loop again scorevec_chr_proc = scorevec_chr scorevec_proc = numpy.asarray(scorevec) posvec = numpy.arange(scorevec_start,scorevec_end) if len(scorevec_proc)>0: maxval = posvec[-1]+halfwinwidth+1 minval = max(0,posvec[0]-halfwinwidth-1) nmax = (maxval+((npointer-maxval)%npointer))/npointer nmin = (minval - (minval%npointer))/npointer if oldperformchr != scorevec_chr_proc: itemposchrindex = poschr.index(scorevec_chr_proc) lengpointer = lengthgpointer[itemposchrindex] oldperformchr = scorevec_chr_proc itemposperform = pos[itemposchrindex] gpointervec = gpointer[itemposchrindex] currentitemvecpos = pos[itemposchrindex] current_splitting_vec = splitting_vec[itemposchrindex] if lengpointer==1: possibleitemvecpos = currentitemvecpos possible_splitting_vec = current_splitting_vec start=0 else: if nmax >= lengpointer: if nmin >= lengpointer: possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax]+1)] possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax]+1)] if nmin >= lengpointer: start = gpointervec[-1] else: start = gpointervec[nmin] minval1 = posvec[0] maxval1 = posvec[-1] for position in range(len(possibleitemvecpos)): if possibleitemvecpos[position][1]>=minval1 and possibleitemvecpos[position][0]<=maxval1: scorevecitem = numpy.zeros(possibleitemvecpos[position][1]-possibleitemvecpos[position][0]) start_scorevecitem = max(0,minval1-possibleitemvecpos[position][0]) start_scorevec_proc = max(0,possibleitemvecpos[position][0]-posvec[0]) end_scorevec_proc = min(possibleitemvecpos[position][1]-1-posvec[0],posvec[-1]-posvec[0])+1 end_scorevecitem = end_scorevec_proc-start_scorevec_proc+start_scorevecitem scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc] scorevecitem_add = numpy.zeros(sum_granu) scorevecitem_cum_sum = numpy.append(0,numpy.cumsum(scorevecitem)) for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]-scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]] Itemvecs[itemposchrindex][start] += scorevecitem_add start += 1 scorevec_proc = numpy.array([]) for i in range(len(poschr)): for j in range(len(pos[i])): for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scaling_factor = splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]-splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex] if scaling_factor!=0: Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = (Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index]+reductionindex])/scaling_factor #write the enrichments into a csv file #store the average profiles for TSS/TTS/Transcript regions and different granulations in a separate csv file import csv csv_open_profile = open(sysimport_profile+'.csv','w') csv_writer_profile = csv.writer(csv_open_profile) csv_list_profile = list() for k in range(len(granulationwin)): csv_open_TSS = open(sysimport_csv+'_TSS_'+str(granulationwin[k])+'.csv','w') csv_writer_TSS = csv.writer(csv_open_TSS) csv_list_TSS = list() csv_open_TTS = open(sysimport_csv+'_TTS_'+str(granulationwin[k])+'.csv','w') csv_writer_TTS = csv.writer(csv_open_TTS) csv_list_TTS = list() csv_open_Transcript = open(sysimport_csv+'_Transcript_'+str(granulationwin[k])+'.csv','w') csv_writer_Transcript = csv.writer(csv_open_Transcript) csv_list_Transcript = list() profile_TSS = numpy.zeros(granulationwin[k]) profile_TTS = numpy.zeros(granulationwin[k]) profile_Transcript = numpy.zeros(granulationwin[k]) headline = list() headline.append("Gene_ID") for i in range(granulationwin[k]): intervalname = 'Interval '+str(i+1) headline.append(intervalname) csv_list_TSS.append(headline) csv_list_TTS.append(headline) csv_list_Transcript.append(headline) for i in range(len(poschr)): if poschr[i] not in chr_not_to_be_stored: for j in range(len(pos[i])): newline = list() newline.append(pos[i][j][4]) extend_line = Itemvecs[i][j][granulationwin_cum_sum_extended[k]:granulationwin_cum_sum_extended[k+1]] if (len(extend_line)>1) and pos[i][j][3]=='-': extend_line = extend_line[::-1] newline.extend(extend_line) if pos[i][j][2]=='TSS': csv_list_TSS.append(newline) profile_TSS += extend_line if pos[i][j][2]=='TTS': csv_list_TTS.append(newline) profile_TTS += extend_line if pos[i][j][2]=='Transcript': csv_list_Transcript.append(newline) profile_Transcript += extend_line csv_writer_TSS.writerows(csv_list_TSS) csv_writer_TTS.writerows(csv_list_TTS) csv_writer_Transcript.writerows(csv_list_Transcript) csv_open_TSS.close() csv_open_TTS.close() csv_open_Transcript.close() TSS_profile = list() TSS_profile.append('TSS') TSS_profile.append(granulationwin[k]) TSS_profile.extend(profile_TSS) csv_list_profile.append(TSS_profile) TTS_profile = list() TTS_profile.append('TTS') TTS_profile.append(granulationwin[k]) TTS_profile.extend(profile_TTS) csv_list_profile.append(TTS_profile) Transcript_profile = list() Transcript_profile.append('Transcript') Transcript_profile.append(granulationwin[k]) Transcript_profile.extend(profile_Transcript) csv_list_profile.append(Transcript_profile) csv_writer_profile.writerows(csv_list_profile) csv_open_profile.close()