# Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# This program is only intended to work if the bedGraph looks as follows
# (which should be the case at least for (most of) the ENCODE data):
# every line looks like
#     chromosome start end value
# so for instance
#     chr1 1 1000 4

import sys
import itertools
import numpy

# Path where the annotation csv file (the output from GTF_processing.py) can
# be found, e.g. "/home/user/Transcripts-v.18.csv".
csv_path = sys.argv[1]
gtffile = open(csv_path)

# Path where the file that has to be processed can be found,
# e.g. "/home/user/example.BedGraph".
processing_file = sys.argv[2]

# Transcript type ('lincRNA' or 'protein_coding' etc.).
# NOTE(review): the historical name shadows the builtin `type`; it is kept
# because the rest of the script reads this module-level name.
type = sys.argv[3]

# halfwinwidth is the range, e.g. +/- halfwinwidth around the TSS/TTS...;
# suggested value 2000.
halfwinwidth = int(sys.argv[4])

# granulationwin is the vector of numbers of windows the area around the
# TSS/TTS or the entire transcript is split into; suggested 1 and 40,
# input format: "1,40".
granulationwin_sys = sys.argv[5]
granulationwin = numpy.asarray([int(win_here) for win_here in granulationwin_sys.split(",")])
# Cumulative window counts; the 0-prefixed variant gives the start offset of
# each granulation's slice inside a flat per-item vector of length sum_granu.
granulationwin_cum_sum = numpy.cumsum(granulationwin)
granulationwin_cum_sum_extended = numpy.append(0, granulationwin_cum_sum)
sum_granu = sum(granulationwin)

# Name under which the profile and region count data should be stored
# (corresponds to input no. 8 in the manual).
store_names = sys.argv[8]
# Output prefix (folder given as sys.argv[6], e.g. "/home/user/") for the
# per-gene region count matrices written below.
sysimport_csv = sys.argv[6] + store_names
# Output prefix (folder given as sys.argv[7], e.g. "/home/user/") for the
# summed profile matrix written below.
sysimport_profile = sys.argv[7] + store_names

# Optional: store in a file whether or not the file format has been ok for our
# purposes - works at least for BigWig ENCODE files that have been transformed
# to BedGraph via the bigWigToBedGraph UCSC tool.
# Input example: "/home/user/correct_type.txt".
if len(sys.argv) >= 10:
    sysimport_correct_type = sys.argv[9]

pos = list()
poschr = list()

# The function defined below outputs a list with two components which are
# lists themselves:
# one (poschr) has as entries all chromosomes, so something like 'chr1',
# 'chr2' etc.; the other one (pos) has as entries the TSS and TTS for the
# considered transcript plus strand information and gene ID, so something like
# [1000,2000,'+','ENSGXXXXXXXX.X'], [20000,3000,'-','ENSGXXXXXXXXXX.X'] etc.
# We consider the outermost TSS/TTS over all transcripts for one gene ID
# (which of course depends on the strand info, so for '+' we take the minimum
# of all TSS over all transcripts for one gene ID, and the maximum for '-').
# This way the profiles of histone modifications etc. get as clear as
# possible, since the overlapping profiles of, say, several TSSs for one gene
# then do not affect each other so much.
# 'transcript_type' may have values 'lincRNA' or 'protein_coding' etc.;
# gtffile is the open annotation csv file from above.
def transcriptprocessing(transcript_type, gtffile):
    """Collect per-gene outermost [start, end, strand, gene ID] records.

    Returns [poschr, pos]: poschr is the list of chromosome names seen, and
    pos[i] is the list of records for chromosome poschr[i].  Rows of the
    annotation csv are assumed to be grouped by gene ID (column 0).
    """
    pos = list()
    poschr = list()
    currentgeneid = '0'
    currentstartval = -1
    currentendval = -1
    currentchr = '0'
    strand = '0'
    currenttype = '0'
    for line in gtffile:
        z = line.split(",")
        if z[0] == currentgeneid:
            # Same gene as the previous row: widen the outermost coordinates.
            if z[2] == transcript_type:
                if strand == "+":
                    currentstartval = min(currentstartval, int(z[4]), int(z[5]))
                    currentendval = max(currentendval, int(z[4]), int(z[5]))
                if strand == "-":
                    currentstartval = max(currentstartval, int(z[4]), int(z[5]))
                    currentendval = min(currentendval, int(z[4]), int(z[5]))
        else:
            # A new gene starts: first flush the finished gene's record...
            if currenttype == transcript_type:
                if currentgeneid != '0':
                    if currentchr in poschr:
                        pos[poschr.index(currentchr)].append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
                    else:
                        newlist = list()
                        newlist.append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
                        pos.append(newlist)
                        poschr.append(currentchr)
            # ...then start tracking the new one.
            currentchr = z[3]
            currenttype = z[2]
            currentgeneid = z[0]
            # BUGFIX: the original compared z[6] against the literals "+\r\n"
            # and "-\r\n", which only matches CRLF files read without newline
            # translation (Python 2 binary-ish reads); for LF files, or under
            # Python 3 universal newlines, the strand was never set and stale
            # coordinates from the previous gene were reused.  Stripping the
            # field keeps the CRLF case working and also accepts "+\n", "-\n"
            # and bare "+"/"-".
            strand_token = z[6].strip()
            if strand_token == "+":
                strand = '+'
                currentstartval = min(int(z[4]), int(z[5]))
                currentendval = max(int(z[4]), int(z[5]))
            if strand_token == "-":
                strand = '-'
                currentstartval = max(int(z[4]), int(z[5]))
                currentendval = min(int(z[4]), int(z[5]))
    # Flush the very last gene of the file.
    if currenttype == transcript_type:
        if currentchr in poschr:
            pos[poschr.index(currentchr)].append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
        else:
            newlist = list()
            newlist.append([min(currentstartval, currentendval), max(currentstartval, currentendval), strand, currentgeneid])
            pos.append(newlist)
            poschr.append(currentchr)
    return [poschr, pos]


# Establishing poschr and pos.
[poschr, pos] = transcriptprocessing(type, gtffile)

# Check that all chromosomes (including chrM) are present in poschr and pos.
# If not we add the chromosome with a "fake" gene, which will not be stored.
chr_not_to_be_stored = list()
control_poschr_list = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                       'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13',
                       'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
                       'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
for chrom in control_poschr_list:
    if chrom not in poschr:
        poschr.append(chrom)
        chr_not_to_be_stored.append(chrom)
        pos.append([[1, 10, '+', 'to_be_deleted']])

# Split every gene record up into TSS / Transcript / TTS regions of the form
# [region start, region end, label, strand, gene ID].
for i in range(len(poschr)):
    newlist = list()
    for z in pos[i]:
        if z[2] == '+':
            newlist.append([int(z[0]) - halfwinwidth, int(z[0]) + halfwinwidth, 'TSS', z[2], z[3]])
            newlist.append([int(z[0]), int(z[1]) + 1, 'Transcript', z[2], z[3]])
            newlist.append([int(z[1]) - halfwinwidth, int(z[1]) + halfwinwidth, 'TTS', z[2], z[3]])
        if z[2] == '-':
            newlist.append([int(z[0]) - halfwinwidth + 1, int(z[0]) + halfwinwidth + 1, 'TTS', z[2], z[3]])
            newlist.append([int(z[0]), int(z[1]) + 1, 'Transcript', z[2], z[3]])
            newlist.append([int(z[1]) - halfwinwidth + 1, int(z[1]) + halfwinwidth + 1, 'TSS', z[2], z[3]])
    pos[i] = newlist

# We sort the TSS/TTS/gene bodies with regard to the position on the chromosome.
from operator import itemgetter
for i in range(len(poschr)):
    pos[i] = sorted(pos[i], key=itemgetter(0))


def granulationpointer(x, n):
    """Build a coarse position index per chromosome.

    For each chromosome i, out[i][k] is the lowest index j such that the item
    x[i][j] (sorted by start position) starts at a chromosome position
    >= k*n.  Used below to restrict overlap searches to a small slice.
    """
    out = list()
    for i in range(len(x)):
        outi = list()
        z = 0
        for j in range(len(x[i])):
            # Every threshold z = k*n not yet exceeded is first reached by j.
            while z <= int(x[i][j][0]):
                outi.append(j)
                z += n
        out.append(numpy.asarray(outi))
    return out


def splitting(length, times, orientation):
    """Return times+1 boundaries splitting [0, length] into (nearly) equal windows.

    The vector starts at 0 and ends at length.  For orientation '-' the
    boundary vector is mirrored (length - x, reversed), so the unevenly sized
    remainder window switches ends.
    """
    x = numpy.arange(0, length, int(numpy.ceil(float(length) / float(times))))
    while len(x) < (times + 1):
        x = numpy.append(x, length)
    if orientation == '-':
        x = numpy.array((times + 1) * [length]) - x
        x = x[::-1]
    return x


# Itemvecs holds, for each chromosome and each TSS/TTS/Transcript item, a flat
# float32 vector of length sum_granu into which enrichment sums are
# accumulated (one slice of length granulationwin[k] per granulation k).
# splitting_vec holds the matching window-boundary indices for each item
# (granulationwin[k]+1 boundaries per granulation k, concatenated).
Itemvecs = list()
splitting_vec = list()
for i in range(len(poschr)):
    itemveci = list()
    splitting_vec_i = list()
    for j in range(len(pos[i])):
        itemvecj = numpy.zeros(sum_granu, dtype='float32')
        splitting_vec_j = numpy.array([], dtype='int32')
        for k in range(len(granulationwin)):
            # pos[i][j][3] is the strand; splitting() mirrors the boundary
            # vector for '-' strand items.
            splitting_vec_j = numpy.append(splitting_vec_j, splitting(pos[i][j][1] - pos[i][j][0], granulationwin[k], pos[i][j][3]))
        splitting_vec_i.append(numpy.asarray(splitting_vec_j))
        itemveci.append(itemvecj)
    Itemvecs.append(numpy.asarray(itemveci))
    splitting_vec.append(numpy.asarray(splitting_vec_i))

# Connect to the data file.
a = open(processing_file)
# Bucket width (in bases) for the coarse index built by granulationpointer.
npointer = 3 * (10 ** 6)
gpointer = granulationpointer(pos, npointer)
lengthgpointer = numpy.asarray([len(gpointer[i]) for i in range(len(poschr))])
# The bedGraph is consumed in chunks of at most breaklength collected values.
breaklength = 100000
# State carried across chunks of the processing loop below.
# NOTE(review): lastchr/lastpos appear unused later; kept for compatibility.
lastchr = '0'
lastpos = -1
oldperformchr = '0'
itemposchrindex = 0
lengpointer = 0
itemposperform = 0
gpointervec = 0
currentitemvecpos = 0
current_splitting_vec = 0
# What comes now is in essence:
# go through the processing file; collect several (at most breaklength) values
# in a certain region; use gpointer resp. gpointervec to figure out the
# potential TSS/TTS/... that may intersect with these enrichment calls; and if
# there is an overlap, map them to the respective vector in Itemvecs.
n = 0
scorevec_start = -1
scorevec_end = -1
scorevec_chr = '0'
scorevec_chr_proc = 0
scorevec_proc = 0
scorevec = list()
scorevec_proc = numpy.asarray([])
# Bookkeeping of every distinct first-column token of the processing file,
# used by the optional format check at the bottom of the script.
processing_file_type = list()
processing_file_frequency = list()
processing_file_firstocurrence = list()
for line in a:
    n += 1
    z_line = line.split()
    if z_line[0] in processing_file_type:
        processing_file_frequency[processing_file_type.index(z_line[0])] += 1
    else:
        processing_file_type.append(z_line[0])
        processing_file_firstocurrence.append(n)
        processing_file_frequency.append(1)
    # Only data lines whose first column contains "chr" are processed.
    if 'chr' not in z_line[0]:
        continue
    region = z_line[0]
    position_start = int(z_line[1])
    position_end = int(z_line[2])
    value = float(z_line[3])
    if scorevec_chr != region:
        # Chromosome changed: hand the collected values over for mapping and
        # start a fresh per-base score vector with the current call.
        scorevec_chr_proc = scorevec_chr
        scorevec_chr = region
        scorevec_proc = numpy.asarray(scorevec)
        scorevec = list()
        if (position_start + 1) == position_end:
            scorevec.append(value)
        else:
            scorevec.extend((position_end - position_start) * [value])
        # posvec holds the chromosome positions of scorevec_proc.
        posvec = numpy.arange(scorevec_start, scorevec_end)
        scorevec_start = position_start
        scorevec_end = position_end
    else:
        if position_start >= scorevec_end:
            # Same chromosome, call at or behind the current end: pad the gap
            # with zeros, then append this call's per-base values.
            scorevec.extend(numpy.zeros(position_start - scorevec_end, dtype='float32'))
            if (position_start + 1) == position_end:
                scorevec.append(value)
            else:
                scorevec.extend((position_end - position_start) * [value])
            scorevec_end = position_end
        else:
            # Positions went backwards: flush and restart, as for a new
            # chromosome.
            scorevec_chr_proc = scorevec_chr
            scorevec_chr = region
            scorevec_proc = numpy.asarray(scorevec)
            scorevec = list()
            if (position_start + 1) == position_end:
                scorevec.append(value)
            else:
                scorevec.extend((position_end - position_start) * [value])
            posvec = numpy.arange(scorevec_start, scorevec_end)
            scorevec_start = position_start
            scorevec_end = position_end
    if len(scorevec) >= breaklength:
        # The chunk is full: flush it and continue collecting right behind it.
        scorevec_chr_proc = scorevec_chr
        scorevec_chr = scorevec_chr
        scorevec_proc = numpy.asarray(scorevec)
        scorevec = list()
        posvec = numpy.arange(scorevec_start, scorevec_end)
        scorevec_start = scorevec_end
        scorevec_end = scorevec_end
    if len(scorevec_proc) > 0:
        maxval = posvec[-1] + halfwinwidth + 1
        minval = max(0, posvec[0] - halfwinwidth - 1)
        # BUGFIX: use floor division so nmax/nmin stay integers and remain
        # valid as indices under Python 3 (true division would yield floats);
        # on the integer operands here this is exactly what '/' computed
        # under Python 2.
        nmax = (maxval + ((npointer - maxval) % npointer)) // npointer
        nmin = (minval - (minval % npointer)) // npointer
        if oldperformchr != scorevec_chr_proc:
            # Chromosome of the flushed chunk changed: reload per-chromosome
            # lookup structures.
            itemposchrindex = poschr.index(scorevec_chr_proc)
            lengpointer = lengthgpointer[itemposchrindex]
            oldperformchr = scorevec_chr_proc
            itemposperform = pos[itemposchrindex]
            gpointervec = gpointer[itemposchrindex]
            currentitemvecpos = pos[itemposchrindex]
            current_splitting_vec = splitting_vec[itemposchrindex]
        # Narrow the candidate items to those whose coarse buckets the chunk
        # can touch; start is the absolute index of the first candidate.
        if lengpointer == 1:
            possibleitemvecpos = currentitemvecpos
            possible_splitting_vec = current_splitting_vec
            start = 0
        else:
            if nmax >= lengpointer:
                if nmin >= lengpointer:
                    possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))]
                    possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))]
                else:
                    possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))]
                    possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))]
            else:
                possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax] + 1)]
                possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax] + 1)]
            if nmin >= lengpointer:
                start = gpointervec[-1]
            else:
                start = gpointervec[nmin]
        minval1 = posvec[0]
        maxval1 = posvec[-1]
        for position in range(len(possibleitemvecpos)):
            if possibleitemvecpos[position][1] >= minval1 and possibleitemvecpos[position][0] <= maxval1:
                # Copy the overlapping part of the chunk into a vector
                # covering the item region, then reduce it window-wise via a
                # cumulative sum.
                scorevecitem = numpy.zeros(possibleitemvecpos[position][1] - possibleitemvecpos[position][0])
                start_scorevecitem = max(0, minval1 - possibleitemvecpos[position][0])
                start_scorevec_proc = max(0, possibleitemvecpos[position][0] - posvec[0])
                end_scorevec_proc = min(possibleitemvecpos[position][1] - 1 - posvec[0], posvec[-1] - posvec[0]) + 1
                end_scorevecitem = end_scorevec_proc - start_scorevec_proc + start_scorevecitem
                scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc]
                scorevecitem_cum_sum = numpy.append(0, numpy.cumsum(scorevecitem))
                scorevecitem_add = numpy.zeros(sum_granu)
                for granulationwin_index in range(len(granulationwin)):
                    for reductionindex in range(granulationwin[granulationwin_index]):
                        # Window sum = difference of cumulative sums at the
                        # window boundaries.  The extra "+granulationwin_index"
                        # skips the one additional boundary that each earlier
                        # granulation contributes to possible_splitting_vec.
                        scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1]] - scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]]
                Itemvecs[itemposchrindex][start] += scorevecitem_add
            # start tracks the absolute index of possibleitemvecpos[position]
            # in this chromosome's item list, so it advances every iteration.
            start += 1
        scorevec_proc = numpy.array([])

# Since the very last chunk collected by the loop has not been mapped yet, we
# essentially run the mapping part of the loop body once more on the remainder.
scorevec_chr_proc = scorevec_chr
scorevec_proc = numpy.asarray(scorevec)
posvec = numpy.arange(scorevec_start, scorevec_end)
if len(scorevec_proc) > 0:
    maxval = posvec[-1] + halfwinwidth + 1
    minval = max(0, posvec[0] - halfwinwidth - 1)
    # BUGFIX: floor division, as above.
    nmax = (maxval + ((npointer - maxval) % npointer)) // npointer
    nmin = (minval - (minval % npointer)) // npointer
    if oldperformchr != scorevec_chr_proc:
        itemposchrindex = poschr.index(scorevec_chr_proc)
        lengpointer = lengthgpointer[itemposchrindex]
        oldperformchr = scorevec_chr_proc
        itemposperform = pos[itemposchrindex]
        gpointervec = gpointer[itemposchrindex]
        currentitemvecpos = pos[itemposchrindex]
        current_splitting_vec = splitting_vec[itemposchrindex]
    if lengpointer == 1:
        possibleitemvecpos = currentitemvecpos
        possible_splitting_vec = current_splitting_vec
        start = 0
    else:
        if nmax >= lengpointer:
            if nmin >= lengpointer:
                possibleitemvecpos = currentitemvecpos[gpointervec[-1]:(len(itemposperform))]
                possible_splitting_vec = current_splitting_vec[gpointervec[-1]:(len(itemposperform))]
            else:
                possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(len(itemposperform))]
                possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(len(itemposperform))]
        else:
            possibleitemvecpos = currentitemvecpos[gpointervec[nmin]:(gpointervec[nmax] + 1)]
            possible_splitting_vec = current_splitting_vec[gpointervec[nmin]:(gpointervec[nmax] + 1)]
        if nmin >= lengpointer:
            start = gpointervec[-1]
        else:
            start = gpointervec[nmin]
    minval1 = posvec[0]
    maxval1 = posvec[-1]
    for position in range(len(possibleitemvecpos)):
        if possibleitemvecpos[position][1] >= minval1 and possibleitemvecpos[position][0] <= maxval1:
            scorevecitem = numpy.zeros(possibleitemvecpos[position][1] - possibleitemvecpos[position][0])
            start_scorevecitem = max(0, minval1 - possibleitemvecpos[position][0])
            start_scorevec_proc = max(0, possibleitemvecpos[position][0] - posvec[0])
            end_scorevec_proc = min(possibleitemvecpos[position][1] - 1 - posvec[0], posvec[-1] - posvec[0]) + 1
            end_scorevecitem = end_scorevec_proc - start_scorevec_proc + start_scorevecitem
            scorevecitem[start_scorevecitem:end_scorevecitem] = scorevec_proc[start_scorevec_proc:end_scorevec_proc]
            scorevecitem_add = numpy.zeros(sum_granu)
            scorevecitem_cum_sum = numpy.append(0, numpy.cumsum(scorevecitem))
            for granulationwin_index in range(len(granulationwin)):
                for reductionindex in range(granulationwin[granulationwin_index]):
                    scorevecitem_add[granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1]] - scorevecitem_cum_sum[possible_splitting_vec[position][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]]
            Itemvecs[itemposchrindex][start] += scorevecitem_add
        start += 1
    scorevec_proc = numpy.array([])

# Normalise every accumulated window sum by its window length (in bases).
for i in range(len(poschr)):
    for j in range(len(pos[i])):
        for granulationwin_index in range(len(granulationwin)):
            for reductionindex in range(granulationwin[granulationwin_index]):
                scaling_factor = splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex + 1] - splitting_vec[i][j][granulationwin_cum_sum_extended[granulationwin_index] + granulationwin_index + reductionindex]
                if scaling_factor != 0:
                    Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index] + reductionindex] = (Itemvecs[i][j][granulationwin_cum_sum_extended[granulationwin_index] + reductionindex]) / scaling_factor

# Write the enrichments into csv files: one per-gene matrix for each region
# type (TSS/TTS/Transcript) and granulation, plus the summed profiles for all
# region types and granulations in a separate csv file.
import csv
csv_open_profile = open(sysimport_profile + '.csv', 'w')
csv_writer_profile = csv.writer(csv_open_profile)
csv_list_profile = list()
for k in range(len(granulationwin)):
    csv_open_TSS = open(sysimport_csv + '_TSS_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_TSS = csv.writer(csv_open_TSS)
    csv_list_TSS = list()
    csv_open_TTS = open(sysimport_csv + '_TTS_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_TTS = csv.writer(csv_open_TTS)
    csv_list_TTS = list()
    csv_open_Transcript = open(sysimport_csv + '_Transcript_' + str(granulationwin[k]) + '.csv', 'w')
    csv_writer_Transcript = csv.writer(csv_open_Transcript)
    csv_list_Transcript = list()
    profile_TSS = numpy.zeros(granulationwin[k])
    profile_TTS = numpy.zeros(granulationwin[k])
    profile_Transcript = numpy.zeros(granulationwin[k])
    headline = list()
    headline.append("Gene_ID")
    for i in range(granulationwin[k]):
        headline.append('Interval ' + str(i + 1))
    csv_list_TSS.append(headline)
    csv_list_TTS.append(headline)
    csv_list_Transcript.append(headline)
    for i in range(len(poschr)):
        # Skip chromosomes that only exist because of the "fake" filler gene.
        if poschr[i] not in chr_not_to_be_stored:
            for j in range(len(pos[i])):
                newline = list()
                newline.append(pos[i][j][4])
                extend_line = Itemvecs[i][j][granulationwin_cum_sum_extended[k]:granulationwin_cum_sum_extended[k + 1]]
                # Reverse the window order for '-'-strand items.
                if (len(extend_line) > 1) and pos[i][j][3] == '-':
                    extend_line = extend_line[::-1]
                newline.extend(extend_line)
                if pos[i][j][2] == 'TSS':
                    csv_list_TSS.append(newline)
                    profile_TSS += extend_line
                if pos[i][j][2] == 'TTS':
                    csv_list_TTS.append(newline)
                    profile_TTS += extend_line
                if pos[i][j][2] == 'Transcript':
                    csv_list_Transcript.append(newline)
                    profile_Transcript += extend_line
    csv_writer_TSS.writerows(csv_list_TSS)
    csv_writer_TTS.writerows(csv_list_TTS)
    csv_writer_Transcript.writerows(csv_list_Transcript)
    csv_open_TSS.close()
    csv_open_TTS.close()
    csv_open_Transcript.close()
    TSS_profile = list()
    TSS_profile.append('TSS')
    TSS_profile.append(granulationwin[k])
    TSS_profile.extend(profile_TSS)
    csv_list_profile.append(TSS_profile)
    TTS_profile = list()
    TTS_profile.append('TTS')
    TTS_profile.append(granulationwin[k])
    TTS_profile.extend(profile_TTS)
    csv_list_profile.append(TTS_profile)
    Transcript_profile = list()
    Transcript_profile.append('Transcript')
    Transcript_profile.append(granulationwin[k])
    Transcript_profile.extend(profile_Transcript)
    csv_list_profile.append(Transcript_profile)
csv_writer_profile.writerows(csv_list_profile)
csv_open_profile.close()

# Optional: check whether the processing file is of such a kind that our
# processing makes sense, and append the result to sysimport_correct_type.
if len(sys.argv) >= 10:
    valid = True
    for i in range(len(processing_file_type)):
        if 'chr' not in processing_file_type[i]:
            valid = False
    f = open(sysimport_correct_type, "a")
    if valid:
        f.write('Is ok for ' + processing_file + '\n')
    else:
        f.write('Not ok for ' + processing_file + '\n')
    f.close()