#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program.  If not, see <http://www.gnu.org/licenses/>


#The function CV_ODE_prediction predicts the enrichment changes at a target time point (predicted enrichments at target time point - measured enrichments at start time point) 
#by using k-fold cross-validation on the loci
CV_ODE_prediction_change = function(start_data,target_data,k=10,ODE_mode = "without intercept",marks = NULL){
  #start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names
  #target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data
  
  #optional parameters:
  #k (Deftault: 10) for specifying the k-fold CV
  #ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept")
  #or not, i.e., general ("with intercept")
  #marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data
  #if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks
  
  #OUTPUT:
  #a dataframe where the number of columns equals the number of predicted marks and the (i,j)-th entry is the predicted enrichment change of mark j at bin i
  
  
  marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data))
  if(is.null(marks)){
    marks_to_consider = marks_in_both_data_frames
  }else{
    marks_to_consider = intersect(marks_in_both_data_frames,marks)
  }
  
  X_k_fold = split(sample(1:nrow(start_data)),1:k)
  
  predictions = matrix(0,nrow(start_data),length(marks_to_consider))
  colnames(predictions) = marks_to_consider
  for(mark in marks_to_consider){
    for(k_fold in 1:k){
      if(ODE_mode == "with intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~., data = start_data[-X_k_fold[[k_fold]],])
      if(ODE_mode == "without intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~.-1, data = start_data[-X_k_fold[[k_fold]],])
      predictions[X_k_fold[[k_fold]],mark] = predict(a,start_data[X_k_fold[[k_fold]],])
    }
  }
  
  #those enrichments that are below 0 or above 1 are set to 0 or 1, respectively
  predictions[which(predictions<0)] = 0
  predictions[which(predictions>1)] = 1
  predictions = data.frame(predictions)
  colnames(predictions) = marks_to_consider
  rownames(predictions) = rownames(start_data)
  
  #passing from absolute enrichment predictions to enrichment prediction changes
  predictions = predictions-start_data[,colnames(predictions)]
  return(predictions)
}


#The function paste_processed_data pastes data of different epigeentic marks at one (!) time point together in one data frame
paste_processed_data = function(paths,mark_names,quantile_cutoff=0.01){
  #paths is the vector of paths to the RData files generated by Enrichments_per_bin_processing.R, so it might look like
  #c("/home/test.RData","/home/test2.RData")
  #mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so
  #if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test.RData" corresponds to H3K4me3 an so on
  
  #optional parameters:
  #quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1,
  #the bottom 1% are set to 0 and the rest is scaled linearly. 
  #If in the unusual cases where the cutoff for the top quantile_cutoff*100% and bottom quantile_cutoff*100% is identical, we set all data for that mark to 0.
  
  #OUTPUT:
  #a data frame, where the (i,j)-th entry corresponds to the enrichment at bin i of mark j, the column names is the vector mark_names,
  #the row names is the bin names of the loaded csv files
  for(i in 1:length(paths)){
    load(paths[i])
    if(i==1){
      pasted_data = matrix(0,nrow(dataset),length(paths))
      rownames_pasted_data = names(dataset)
    }
    q1=quantile(dataset,quantile_cutoff)
    q2=quantile(dataset,1-quantile_cutoff)
    dataset[dataset<q1]=q1
    dataset[dataset>q2]=q2
    dataset = dataset-q1
    if(q2>q1){
      dataset = dataset/(q2-q1)
    }else{
      dataset = rep(0,length(dataset))
    }
    pasted_data[,i] = dataset
  }
  pasted_data = data.frame(pasted_data)
  rownames(pasted_data) = rownames_pasted_data
  colnames(pasted_data) = mark_names
  return(pasted_data)
}


#The function ode_exponential_matrix_rows returns a list of vectors where to each mark (from the parameter marks)
#we do have a vector in that list describing the linear relationship between all data in start_data and that particular mark in target_data
ode_exponential_matrix_rows = function(start_data,target_data,ODE_mode = "without intercept",marks = NULL){
  #start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names
  #target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data
  
  #optional parameters:
  #ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept")
  #or not, i.e., general ("with intercept")
  #marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data
  #if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks
  
  #OUTPUT:
  #a list where to each considered mark there is an entry which is a vector corresponding the linear weights describing the enrichment of that mark
  #in target_data by a linear model of all marks in start_data
  
  
  marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data))
  if(is.null(marks)){
    marks_to_consider = marks_in_both_data_frames
  }else{
    marks_to_consider = intersect(marks_in_both_data_frames,marks)
  }
  
  exponential_matrix_row_list = list()
  for(mark in marks_to_consider){
    if(ODE_mode == "with intercept"){
      a = lm(formula=target_data[,mark]~., data = start_data[,])
      v_add = a[[1]]
      names(v_add) = c("(Intercept)",colnames(start_data))
    }
    if(ODE_mode == "without intercept"){
      a = lm(formula=target_data[,mark]~.-1, data = start_data[,])
      v_add = a[[1]]
      names(v_add) = colnames(start_data)
    }
    exponential_matrix_row_list[[mark]] = v_add
  }
  
  return(exponential_matrix_row_list)
}


#The function ode_logarithm_matrix reconstructs the exponential_matrix from exponential_matrix_rows_list and
#returns the matrix logarithm on all marks that are present in both start_data and target_data (corresponds to the matrix A in dx/dt = A*x) in the homogeneous case.
#In the not necessarily homogeneous case we reconstruct A and b for matching dx/dt = A*x+b

#IMPORTANT: ode_exponential_matrix_rows should have been called beforehand for all marks with ODE_mode = "with intercept" (not necessarily homoegenous case)
#or ODE_mode = "without intercept"  (homogeneous case), and without come mixture (the algorithm detects automatically which case it is)
ode_logarithm_matrix = function(exponential_matrix_rows_list,start_time_point,end_time_point,temporal_path){
  #exponential_matrix_rows_list output from ode_exponential_matrix_rows with values for all marks that are present in both start_data and target_data
  #if ode_exponential_matrix_rows has been called for subsets of marks, then these must be brought together in one list 
  #start_time_point is the start time point (can be in hours, minutes, etc.), e.g., 10 (corresponding to minutes)
  #end_time_point is the end time point (units must be consistent with start_time_point)
  #temporal_path is the path to a file name where we can store a csv file for a short time (the path has to be existent, not the file itself), 
  #e.g. "/home/intermediate.csv" or "/home/temp/intermediate.csv" etc.
  
  #OUTPUT:
  #in the homogeneous case a list with A corresponding to the matrix in dx/dt = A*x+b
  #in the not necessarily homogeneous case a list with A and b corresponding to the matrix and vector in dx/dt = A*x+b

  
  #reconstructing the exponential matrix by pasting together lines
  
  B = matrix(0,length(exponential_matrix_rows_list),length(exponential_matrix_rows_list))
  colnames(B) = names(exponential_matrix_rows_list)
  rownames(B) = names(exponential_matrix_rows_list)
  
  #c is the constant terms in the expoential fit, if used
  c = rep(0,length(exponential_matrix_rows_list))
  use_c = FALSE
  for(i in 1:length(exponential_matrix_rows_list)){
    B[i,] = exponential_matrix_rows_list[[i]][colnames(B)]
    if("(Intercept)" %in% names(exponential_matrix_rows_list[[i]])){
      use_c = TRUE
      c[i] = exponential_matrix_rows_list[[i]]["(Intercept)"]
    }
  }
  
  #if a not necessarily homogeneous system is assumed, we paste it to B to obtain the following matrix
  # B c
  # 0 1
  #and apply the matrix algorithm to it instead of the origrinal B alone, then afterwards we get out A and b from the logarithmic matrix
  #which has the form
  # A b
  # 0 0
  if(use_c){
    B = cbind(B,c)
    B = rbind(B,rep(0,length(exponential_matrix_rows_list)+1))
    B[length(exponential_matrix_rows_list)+1,length(exponential_matrix_rows_list)+1] = 1
    colnames(B) = c(names(exponential_matrix_rows_list),"constant")
    rownames(B) = c(names(exponential_matrix_rows_list),"constant")
  }
  
  #write matrix exponential for further processing matlab to csv file
  write.table(B,col.names = FALSE,row.names = FALSE, file = temporal_path, sep =",")
  
  #call matlab for matrix logarithm logm (can handle complex values which might arise)
  system_string = paste0('matlab -nodisplay -r "',"expA = csvread('",temporal_path,"',0,0);A = logm(expA);","A=A/(",end_time_point-start_time_point,");",
                         "csvwrite('",temporal_path,"',A);",'exit"')
  system(system_string)
  #read the matrix logarithm into matlab again
  A = read.csv(temporal_path,header = FALSE)
  #delete intermediate file
  unlink(temporal_path)
  
  #the matrix might be complex in case complex values are present, normally it is numeric
  A = as.matrix(A)
  
  if(!use_c){
    colnames(A) = names(exponential_matrix_rows_list)
    rownames(A) = names(exponential_matrix_rows_list)
    output_list = list("A"=A)
  }else{
    #in the not necessarily homogeneous case, A and b is reconstructed as outlined above
    b = A[1:(nrow(A)-1),ncol(A)]
    A = A[1:(nrow(A)-1),1:(ncol(A)-1)]
    colnames(A) = names(exponential_matrix_rows_list)
    rownames(A) = names(exponential_matrix_rows_list)
    names(b) =names(exponential_matrix_rows_list)
    output_list = list("A"=A,"b"=b)
  }
  return(output_list)
}