#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see #This program should sort out which genes should not be taken for our data analysis #The selection should be made up of all mitochondrial protein coding or lindRNA genes etc., or genes on chr X and chr Y #and genes that intersect with regions of the ENCODE blacklist, which lists regions that should never be considered for data analysis for various reasons import sys import itertools import numpy pos = list() poschr = list() #path where the annotation csv file (the output from GTF_processing.py) can be found # e.g. "/home/user/Transcripts-v.18.csv" csv_path = sys.argv[1] gtffile = open(csv_path) #path where the consensusBlacklist can be found # e.g. "/home/user/consensusBlacklist.bed" blackblist_path = sys.argv[2] #type ('lincRNA' or 'protein_coding' etc.) type = sys.argv[3] #Folder to store gene blacklist, e.g., "/home/user/" sysimport_txt = sys.argv[4] #optional: which chromosomes should be deleted, e.g., "chrX chrY chrM" chromosomes_to_be_deleted = sys.argv[5:len(sys.argv)] # The following function outputs a list with two components which are lists themselves: #one (poschr) has as entries all chromosomes so something like 'chr1', 'chr2' etc. #the other one (pos) has as entries the TSS and TTS for the considered considered transcript plus strand information and gene ID # so something like [1000,2000,'+','ENSGXXXXXXXX.X'],[20000,3000,'-','ENSGXXXXXXXXXX.X'] etc. #we consider the outmost TSS/TTS for all transcripts for one gene ID (which of course depends on the strand info, #so for '+' for take the minimum of all TSS for all transcripts for one gene ID and max for '-') and if False one takes all transcripts individually #this is because then the profiles of histone modifications etc. get as clear as possible since the overlapping profiles of different, say, TSSs for one gene don't affect each other so much then #'transcript_type' may as values 'lncRNA' or 'protein_coding' etc. #gtffile is clear from above def transcriptprocessing(transcript_type,gtffile): pos = list() poschr = list() currentgeneid = '0' currentstartval = -1 currentendval = -1 currentchr = '0' strand = '0' currenttype = '0' for line in gtffile: z = line.split(",") if z[0]==currentgeneid: if z[2]==transcript_type: if strand == "+": currentstartval = min(currentstartval,int(z[4]),int(z[5])) currentendval = max(currentendval,int(z[4]),int(z[5])) if strand == "-": currentstartval = max(currentstartval,int(z[4]),int(z[5])) currentendval = min(currentendval,int(z[4]),int(z[5])) else: if currenttype==transcript_type: if currentgeneid != '0': if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) currentchr = z[3] currenttype = z[2] currentgeneid = z[0] if z[6] == "+\r\n": strand = '+' currentstartval = min(int(z[4]),int(z[5])) currentendval = max(int(z[4]),int(z[5])) if z[6] == "-\r\n": strand = '-' currentstartval = max(int(z[4]),int(z[5])) currentendval = min(int(z[4]),int(z[5])) if currenttype==transcript_type: if currentchr in poschr: pos[poschr.index(currentchr)].append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) else: newlist = list() newlist.append([min(currentstartval,currentendval),max(currentstartval,currentendval),strand,currentgeneid]) pos.append(newlist) poschr.append(currentchr) return [poschr,pos] #establishing poschr_feature and pos_feature poschr_feature = list() pos_feature = list() [poschr_feature,pos_feature] = transcriptprocessing(type,gtffile) #add blacklist z = open(blackblist_path) for line in z: z1 = line.split() if z1[0] not in poschr: poschr.append(z1[0]) newlist = list() newlist.append([int(z1[1]),int(z1[2])-1,'.','to be deleted']) pos.append(newlist) else: pos[poschr.index(z1[0])].append([int(z1[1]),int(z1[2])-1,'.','to be deleted']) #we sort the TSS/TTS/Gene bodies with regards to the position on the chromosome from operator import itemgetter for i in range(len(poschr)): pos[i] = sorted(pos[i], key=itemgetter(0)) from operator import itemgetter for i in range(len(poschr_feature)): pos_feature[i] = sorted(pos_feature[i], key=itemgetter(0)) #this function should help us later #in essence it just says that to each chr i and k*n what is the lowest index j such that the minimum position of the item in the list pos[i] for chr i and with position j in this list #has a chromosome position >= k*n def granulationpointer(x,n): out = list() for i in range(len(x)): outi = list() z = 0 for j in range(len(x[i])): while z<=int(x[i][j][0]): outi.append(j) z += n out.append(numpy.asarray(outi)) return out npointer = 3*(10 ** 6) gpointer = granulationpointer(pos_feature,npointer) lengthgpointer = list() for i in range(len(poschr_feature)): lengthgpointer.append(len(gpointer[i])) lengthgpointer = numpy.asarray(lengthgpointer) write_list = list() oldperformchr = '0' #loop through the genes of each chromosome from poschr_feature and check whether there is some overlap with the blacklist regions and add them to the remove list if that is the case #also remove every gene on the chromosomes in chromosomes_to_be_deleted for i in range(len(pos_feature)): scorevec_chr_proc = poschr_feature[i] if scorevec_chr_proc in chromosomes_to_be_deleted: for j in range(len(pos_feature[i])): write_list.append(pos_feature[i][j][3]+'\n') continue for j in range(len(pos_feature[i])): minval1 = pos_feature[i][j][0] maxval1 = pos_feature[i][j][1] maxval = pos_feature[i][j][0]+2 minval = max(0,pos_feature[i][j][1]-2) nmax = (maxval+((npointer-maxval)%npointer))/npointer nmin = (minval - (minval%npointer))/npointer if oldperformchr != scorevec_chr_proc: itemposchrindex = poschr.index(scorevec_chr_proc) lengpointer = lengthgpointer[itemposchrindex] oldperformchr = scorevec_chr_proc itemposperform = pos[itemposchrindex] gpointervec = gpointer[itemposchrindex] currentitemvecpos = pos[itemposchrindex] if lengpointer==1: possibleitemvecpos = currentitemvecpos start=0 else: if nmax >= lengpointer: if nmin >= lengpointer: possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))] else: possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax]+1)] if nmin >= lengpointer: start = gpointervec[-1] else: start = gpointervec[nmin] should_be_appended = False for position in range(len(possibleitemvecpos)): if possibleitemvecpos[position][1]>=minval1 and possibleitemvecpos[position][0]<=maxval1: should_be_appended = True if should_be_appended == True: write_list.append(pos_feature[i][j][3]+'\n') f = open(sysimport_txt+type+"_Not_to_be_considered.txt",'w') for write_list_line in write_list: f.write(write_list_line) f.close()