#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program. If not, see
#The function CV_ODE_prediction predicts the enrichment changes at a target time point (predicted enrichments at target time point - measured enrichments at start time point)
#by using k-fold cross-validation on the loci
CV_ODE_prediction_change = function(start_data,target_data,k=10,ODE_mode = "without intercept",marks = NULL){
#start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names
#target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data
#optional parameters:
#k (Deftault: 10) for specifying the k-fold CV
#ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept")
#or not, i.e., general ("with intercept")
#marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data
#if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks
#OUTPUT:
#a dataframe where the number of columns equals the number of predicted marks and the (i,j)-th entry is the predicted enrichment change of mark j at bin i
marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data))
if(is.null(marks)){
marks_to_consider = marks_in_both_data_frames
}else{
marks_to_consider = intersect(marks_in_both_data_frames,marks)
}
X_k_fold = split(sample(1:nrow(start_data)),1:k)
predictions = matrix(0,nrow(start_data),length(marks_to_consider))
colnames(predictions) = marks_to_consider
for(mark in marks_to_consider){
for(k_fold in 1:k){
if(ODE_mode == "with intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~., data = start_data[-X_k_fold[[k_fold]],])
if(ODE_mode == "without intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~.-1, data = start_data[-X_k_fold[[k_fold]],])
predictions[X_k_fold[[k_fold]],mark] = predict(a,start_data[X_k_fold[[k_fold]],])
}
}
#those enrichments that are below 0 or above 1 are set to 0 or 1, respectively
predictions[which(predictions<0)] = 0
predictions[which(predictions>1)] = 1
predictions = data.frame(predictions)
colnames(predictions) = marks_to_consider
rownames(predictions) = rownames(start_data)
#passing from absolute enrichment predictions to enrichment prediction changes
predictions = predictions-start_data[,colnames(predictions)]
return(predictions)
}
#The function paste_processed_data pastes data of different epigeentic marks at one (!) time point together in one data frame
paste_processed_data = function(paths,mark_names,quantile_cutoff=0.01){
#paths is the vector of paths to the RData files generated by Enrichments_per_bin_processing.R, so it might look like
#c("/home/test.RData","/home/test2.RData")
#mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so
#if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test.RData" corresponds to H3K4me3 an so on
#optional parameters:
#quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1,
#the bottom 1% are set to 0 and the rest is scaled linearly.
#If in the unusual cases where the cutoff for the top quantile_cutoff*100% and bottom quantile_cutoff*100% is identical, we set all data for that mark to 0.
#OUTPUT:
#a data frame, where the (i,j)-th entry corresponds to the enrichment at bin i of mark j, the column names is the vector mark_names,
#the row names is the bin names of the loaded csv files
for(i in 1:length(paths)){
load(paths[i])
if(i==1){
pasted_data = matrix(0,nrow(dataset),length(paths))
rownames_pasted_data = names(dataset)
}
q1=quantile(dataset,quantile_cutoff)
q2=quantile(dataset,1-quantile_cutoff)
dataset[datasetq2]=q2
dataset = dataset-q1
if(q2>q1){
dataset = dataset/(q2-q1)
}else{
dataset = rep(0,length(dataset))
}
pasted_data[,i] = dataset
}
pasted_data = data.frame(pasted_data)
rownames(pasted_data) = rownames_pasted_data
colnames(pasted_data) = mark_names
return(pasted_data)
}
#The function ode_exponential_matrix_rows returns a list of vectors where to each mark (from the parameter marks)
#we do have a vector in that list describing the linear relationship between all data in start_data and that particular mark in target_data
ode_exponential_matrix_rows = function(start_data,target_data,ODE_mode = "without intercept",marks = NULL){
#start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names
#target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data
#optional parameters:
#ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept")
#or not, i.e., general ("with intercept")
#marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data
#if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks
#OUTPUT:
#a list where to each considered mark there is an entry which is a vector corresponding the linear weights describing the enrichment of that mark
#in target_data by a linear model of all marks in start_data
marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data))
if(is.null(marks)){
marks_to_consider = marks_in_both_data_frames
}else{
marks_to_consider = intersect(marks_in_both_data_frames,marks)
}
exponential_matrix_row_list = list()
for(mark in marks_to_consider){
if(ODE_mode == "with intercept"){
a = lm(formula=target_data[,mark]~., data = start_data[,])
v_add = a[[1]]
names(v_add) = c("(Intercept)",colnames(start_data))
}
if(ODE_mode == "without intercept"){
a = lm(formula=target_data[,mark]~.-1, data = start_data[,])
v_add = a[[1]]
names(v_add) = colnames(start_data)
}
exponential_matrix_row_list[[mark]] = v_add
}
return(exponential_matrix_row_list)
}
#The function ode_logarithm_matrix reconstructs the exponential_matrix from exponential_matrix_rows_list and
#returns the matrix logarithm on all marks that are present in both start_data and target_data (corresponds to the matrix A in dx/dt = A*x) in the homogeneous case.
#In the not necessarily homogeneous case we reconstruct A and b for matching dx/dt = A*x+b
#IMPORTANT: ode_exponential_matrix_rows should have been called beforehand for all marks with ODE_mode = "with intercept" (not necessarily homoegenous case)
#or ODE_mode = "without intercept" (homogeneous case), and without come mixture (the algorithm detects automatically which case it is)
ode_logarithm_matrix = function(exponential_matrix_rows_list,start_time_point,end_time_point,temporal_path){
#exponential_matrix_rows_list output from ode_exponential_matrix_rows with values for all marks that are present in both start_data and target_data
#if ode_exponential_matrix_rows has been called for subsets of marks, then these must be brought together in one list
#start_time_point is the start time point (can be in hours, minutes, etc.), e.g., 10 (corresponding to minutes)
#end_time_point is the end time point (units must be consistent with start_time_point)
#temporal_path is the path to a file name where we can store a csv file for a short time (the path has to be existent, not the file itself),
#e.g. "/home/intermediate.csv" or "/home/temp/intermediate.csv" etc.
#OUTPUT:
#in the homogeneous case a list with A corresponding to the matrix in dx/dt = A*x+b
#in the not necessarily homogeneous case a list with A and b corresponding to the matrix and vector in dx/dt = A*x+b
#reconstructing the exponential matrix by pasting together lines
B = matrix(0,length(exponential_matrix_rows_list),length(exponential_matrix_rows_list))
colnames(B) = names(exponential_matrix_rows_list)
rownames(B) = names(exponential_matrix_rows_list)
#c is the constant terms in the expoential fit, if used
c = rep(0,length(exponential_matrix_rows_list))
use_c = FALSE
for(i in 1:length(exponential_matrix_rows_list)){
B[i,] = exponential_matrix_rows_list[[i]][colnames(B)]
if("(Intercept)" %in% names(exponential_matrix_rows_list[[i]])){
use_c = TRUE
c[i] = exponential_matrix_rows_list[[i]]["(Intercept)"]
}
}
#if a not necessarily homogeneous system is assumed, we paste it to B to obtain the following matrix
# B c
# 0 1
#and apply the matrix algorithm to it instead of the origrinal B alone, then afterwards we get out A and b from the logarithmic matrix
#which has the form
# A b
# 0 0
if(use_c){
B = cbind(B,c)
B = rbind(B,rep(0,length(exponential_matrix_rows_list)+1))
B[length(exponential_matrix_rows_list)+1,length(exponential_matrix_rows_list)+1] = 1
colnames(B) = c(names(exponential_matrix_rows_list),"constant")
rownames(B) = c(names(exponential_matrix_rows_list),"constant")
}
#write matrix exponential for further processing matlab to csv file
write.table(B,col.names = FALSE,row.names = FALSE, file = temporal_path, sep =",")
#call matlab for matrix logarithm logm (can handle complex values which might arise)
system_string = paste0('matlab -nodisplay -r "',"expA = csvread('",temporal_path,"',0,0);A = logm(expA);","A=A/(",end_time_point-start_time_point,");",
"csvwrite('",temporal_path,"',A);",'exit"')
system(system_string)
#read the matrix logarithm into matlab again
A = read.csv(temporal_path,header = FALSE)
#delete intermediate file
unlink(temporal_path)
#the matrix might be complex in case complex values are present, normally it is numeric
A = as.matrix(A)
if(!use_c){
colnames(A) = names(exponential_matrix_rows_list)
rownames(A) = names(exponential_matrix_rows_list)
output_list = list("A"=A)
}else{
#in the not necessarily homogeneous case, A and b is reconstructed as outlined above
b = A[1:(nrow(A)-1),ncol(A)]
A = A[1:(nrow(A)-1),1:(ncol(A)-1)]
colnames(A) = names(exponential_matrix_rows_list)
rownames(A) = names(exponential_matrix_rows_list)
names(b) =names(exponential_matrix_rows_list)
output_list = list("A"=A,"b"=b)
}
return(output_list)
}