#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see import sys import itertools import numpy import HTSeq import csv #path where the annotation csv file (the output from GTF_processing.py) can be found # e.g. "/home/user/Transcripts-v.18.csv" csv_path = sys.argv[1] gtffile = open(csv_path) #path where the human genome fasta file (e.g. hg19) can be found # e.g. "/home/user/hg19.fa" fa_path = sys.argv[2] #type ('lincRNA' or 'protein_coding' etc.) type = sys.argv[3] #halfwinwidth is the range, e.g., +/- halfwinwidth around the TSS/TTS..., suggested value 2000 halfwinwidth = int(sys.argv[4]) #granulationwin is the vector for numbers of windows the area around the TSS/TTS or the entire transcript is split into..., suggested 1 and 40, Input: "1,40" granulationwin_sys = sys.argv[5] granulationwin_sys = granulationwin_sys.split(",") granulationwin = list() for win_here in granulationwin_sys: granulationwin.append(int(win_here)) granulationwin = numpy.asarray(granulationwin) granulationwin_cum_sum = numpy.cumsum(granulationwin) granulationwin_cum_sum_extended = numpy.append(0,granulationwin_cum_sum) sum_granu = sum(granulationwin) #Folder to store profile matrix, as sys.argv input you may take, e.g., "/home/user/" sysimport_csv = sys.argv[6]+type #Folder to store CpG count matrices, as sys.argv input you may take, e.g., "/home/user/" sysimport_profile = sys.argv[7]+type # The following function outputs a list with two components which are lists themselves: #one (poschr) has as entries all chromosomes so something like 'chr1', 'chr2' etc. #the other one (pos) has as entries the TSS and TTS for the considered considered transcript plus strand information and gene ID # so something like [1000,2000,'+','ENSGXXXXXXXX.X'],[20000,3000,'-','ENSGXXXXXXXXXX.X'] etc. #we consider the outmost TSS/TTS for all transcripts for one gene ID (which of course depends on the strand info, #so for '+' for take the minimum of all TSS for all transcripts for one gene ID and max for '-') and if False one takes all transcripts individually #this is because then the profiles of histone modifications etc. get as clear as possible since the overlapping profiles of different, say, TSSs for one gene don't affect each other so much then #'transcript_type' may as values 'lncRNA' or 'protein_coding' etc. #gtffile is clear from above def transcriptprocessing(transcript_type,gtffile): pos = list() poschr = list() currentgeneid = '0' currentstartval = -1 currentendval = -1 currentchr = '0' strand = '0' currenttype = '0' for line in gtffile: z = line.split(",") if z[0]==currentgeneid: if z[2]==transcript_type: if strand == "+": currentstartval = min(currentstartval,int(z[4]),int(z[5])) currentendval = max(currentendval,int(z[4]),int(z[5])) if strand == "-": currentstartval = max(currentstartval,int(z[4]),int(z[5])) currentendval = min(currentendval,int(z[4]),int(z[5])) else: if currenttype==transcript_type: if currentgeneid != '0': if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) currentchr = z[3] currenttype = z[2] currentgeneid = z[0] if z[6] == "+\r\n": strand = '+' currentstartval = min(int(z[4]),int(z[5])) currentendval = max(int(z[4]),int(z[5])) if z[6] == "-\r\n": strand = '-' currentstartval = max(int(z[4]),int(z[5])) currentendval = min(int(z[4]),int(z[5])) if currenttype==transcript_type: if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) return [poschr,pos] #establishing poschr and pos [poschr,pos] = transcriptprocessing(type,gtffile) #check if all chromosomes (including chrM) are in is pochr and pos #if not we will add everything including a "fake" gene, which will not be stored chr_not_to_be_stored = list() control_poschr_list = list(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']) for chr in control_poschr_list: if chr not in poschr: poschr.append(chr) chr_not_to_be_stored.append(chr) newlist = list() newlist.append([1,10,'+','to_be_deleted']) pos.append(newlist) #splitting the lines up into TSS/Transcript/TTS regions for i in range(len(poschr)): newlist = list() for z in pos[i]: if z[2]=='+': newlist.append([int(z[0])-halfwinwidth,int(z[0])+halfwinwidth,'TSS',z[2],z[3]]) newlist.append([int(z[0]),int(z[1])+1,'Transcript',z[2],z[3]]) newlist.append([int(z[1])-halfwinwidth,int(z[1])+halfwinwidth,'TTS',z[2],z[3]]) if z[2]=='-': newlist.append([int(z[0])-halfwinwidth+1,int(z[0])+halfwinwidth+1,'TTS',z[2],z[3]]) newlist.append([int(z[0]),int(z[1])+1,'Transcript',z[2],z[3]]) newlist.append([int(z[1])-halfwinwidth+1,int(z[1])+halfwinwidth+1,'TSS',z[2],z[3]]) pos[i]=newlist #we sort the TSS/TTS/Gene bodies with regards to the position on the chromosome from operator import itemgetter for i in range(len(poschr)): pos[i] = sorted(pos[i], key=itemgetter(0)) #in essence this function just says that to each chr i and k*n what is the lowest index j such that the minimum position of the item in the list pos[i] for chr i and with position j in this list #has a chromosome position >= k*n def granulationpointer(x,n): out = list() for i in range(len(x)): outi = list() z = 0 for j in range(len(x[i])): while z<=int(x[i][j][0]): outi.append(j) z += n out.append(numpy.asarray(outi)) return out #The following function generates a vector for each length and times input, such that it has a entries zero and equidistributed further entries which together split the vector #[0,1,...,length] into (nearly) equal windows def splitting(length,times,orientation): x = numpy.arange(0,length,int(numpy.ceil(float(length)/float(times)))) while len(x)<(times+1): x = numpy.append(x,length) if orientation == '-': x = numpy.array((times+1)*[length])-x x = x[::-1] return x #Itemvecs has in the end to each chr and each TSS/TTS/... a vector with granulationwin vectors of length granulationwin[k] to which we map potential enrichment value in the respective window #splitting_vec gives the granulation windows indices for each item of interest Itemvecs = list() splitting_vec = list() for i in range(len(poschr)): itemveci = list() splitting_vec_i = list() for j in range(len(pos[i])): itemvecj = numpy.zeros(sum_granu,dtype='float32') splitting_vec_j = numpy.array([],dtype='int32') for k in range(len(granulationwin)): splitting_vec_j = numpy.append(splitting_vec_j,splitting(pos[i][j][1]-pos[i][j][0],granulationwin[k],pos[i][j][3])) splitting_vec_i.append(numpy.asarray(splitting_vec_j)) itemveci.append(itemvecj) Itemvecs.append(numpy.asarray(itemveci)) splitting_vec.append(numpy.asarray(splitting_vec_i)) #load the Fasta file a = dict( (s.name, s) for s in HTSeq.FastaReader(fa_path)) n=0 #Count for each region of interest the number of CpG falling into that region for i in range(len(poschr)): x = a[poschr[i]].seq for j in range(len(pos[i])): x_seq = x[(pos[i][j][0]-1):(pos[i][j][1]+1)] x_seq = x_seq.upper() x_no_seq = numpy.array((pos[i][j][1]-pos[i][j][0])*[0]) for k in range(len(x_seq)-1): if k==0: if x_seq[k] == 'C' and x_seq[k+1] == 'G': x_no_seq[0]+=1 elif k==len(x_seq)-2: if x_seq[k] == 'C' and x_seq[k+1] == 'G': x_no_seq[k-1]+=1 else: if x_seq[k] == 'C' and x_seq[k+1] == 'G': x_no_seq[k]+=1 x_no_seq[k-1]+=1 scorevecitem_cum_sum = numpy.append(0,numpy.cumsum(x_no_seq)) scorevecitem_add = numpy.zeros(sum_granu) for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = scorevecitem_cum_sum[splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]]-scorevecitem_cum_sum[splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex]] Itemvecs[i][j] += scorevecitem_add #divide the count values for each region by the size of that region for i in range(len(poschr)): for j in range(len(pos[i])): for granulationwin_index in range(len(granulationwin)): for reductionindex in range(granulationwin[granulationwin_index]): scaling_factor = splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex+1]-splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index]+granulationwin_index+reductionindex] if scaling_factor!=0: Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index]+reductionindex] = (Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index]+reductionindex])/scaling_factor #write the enrichments into a csv file #store the average profiles for TSS/TTS/Transcript regions and different granulations in a separate csv file csv_open_profile = open(sysimport_profile+'.csv','w') csv_writer_profile = csv.writer(csv_open_profile) csv_list_profile = list() for k in range(len(granulationwin)): csv_open_TSS = open(sysimport_csv+'_TSS_'+str(granulationwin[k])+'.csv','w') csv_writer_TSS = csv.writer(csv_open_TSS) csv_list_TSS = list() csv_open_TTS = open(sysimport_csv+'_TTS_'+str(granulationwin[k])+'.csv','w') csv_writer_TTS = csv.writer(csv_open_TTS) csv_list_TTS = list() csv_open_Transcript = open(sysimport_csv+'_Transcript_'+str(granulationwin[k])+'.csv','w') csv_writer_Transcript = csv.writer(csv_open_Transcript) csv_list_Transcript = list() profile_TSS = numpy.zeros(granulationwin[k]) profile_TTS = numpy.zeros(granulationwin[k]) profile_Transcript = numpy.zeros(granulationwin[k]) headline = list() headline.append("Gene_ID") for i in range(granulationwin[k]): intervalname = 'Interval '+str(i+1) headline.append(intervalname) csv_list_TSS.append(headline) csv_list_TTS.append(headline) csv_list_Transcript.append(headline) for i in range(len(poschr)): if poschr[i] not in chr_not_to_be_stored: for j in range(len(pos[i])): newline = list() newline.append(pos[i][j][4]) extend_line = Itemvecs[i][j][granulationwin_cum_sum_extended[k]:granulationwin_cum_sum_extended[k+1]] if (len(extend_line)>1) and pos[i][j][3]=='-': extend_line = extend_line[::-1] newline.extend(extend_line) if pos[i][j][2]=='TSS': csv_list_TSS.append(newline) profile_TSS += extend_line if pos[i][j][2]=='TTS': csv_list_TTS.append(newline) profile_TTS += extend_line if pos[i][j][2]=='Transcript': csv_list_Transcript.append(newline) profile_Transcript += extend_line csv_writer_TSS.writerows(csv_list_TSS) csv_writer_TTS.writerows(csv_list_TTS) csv_writer_Transcript.writerows(csv_list_Transcript) csv_open_TSS.close() csv_open_TTS.close() csv_open_Transcript.close() TSS_profile = list() TSS_profile.append('TSS') TSS_profile.append(granulationwin[k]) TSS_profile.extend(profile_TSS) csv_list_profile.append(TSS_profile) TTS_profile = list() TTS_profile.append('TTS') TTS_profile.append(granulationwin[k]) TTS_profile.extend(profile_TTS) csv_list_profile.append(TTS_profile) Transcript_profile = list() Transcript_profile.append('Transcript') Transcript_profile.append(granulationwin[k]) Transcript_profile.extend(profile_Transcript) csv_list_profile.append(Transcript_profile) csv_writer_profile.writerows(csv_list_profile) csv_open_profile.close()