#Copyright (C) 2016, Author: Tobias Ahsendorf, E-Mail: tobias.ahsendorf@gmail.com # #This program is free software: you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation, either version 3 of the License, or #(at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program. If not, see #The function CV_ODE_prediction predicts the enrichment changes at a target time point (predicted enrichments at target time point - measured enrichments at start time point) #by using k-fold cross-validation on the loci CV_ODE_prediction_change = function(start_data,target_data,k=10,ODE_mode = "without intercept",marks = NULL){ #start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names #target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data #optional parameters: #k (Deftault: 10) for specifying the k-fold CV #ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept") #or not, i.e., general ("with intercept") #marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data #if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks #OUTPUT: #a dataframe where the number of columns equals the number of predicted marks and the (i,j)-th entry is the predicted enrichment change of mark j at bin i marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data)) if(is.null(marks)){ marks_to_consider = marks_in_both_data_frames }else{ marks_to_consider = intersect(marks_in_both_data_frames,marks) } X_k_fold = split(sample(1:nrow(start_data)),1:k) predictions = matrix(0,nrow(start_data),length(marks_to_consider)) colnames(predictions) = marks_to_consider for(mark in marks_to_consider){ for(k_fold in 1:k){ if(ODE_mode == "with intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~., data = start_data[-X_k_fold[[k_fold]],]) if(ODE_mode == "without intercept") a = lm(formula=target_data[-X_k_fold[[k_fold]],mark]~.-1, data = start_data[-X_k_fold[[k_fold]],]) predictions[X_k_fold[[k_fold]],mark] = predict(a,start_data[X_k_fold[[k_fold]],]) } } #those enrichments that are below 0 or above 1 are set to 0 or 1, respectively predictions[which(predictions<0)] = 0 predictions[which(predictions>1)] = 1 predictions = data.frame(predictions) colnames(predictions) = marks_to_consider rownames(predictions) = rownames(start_data) #passing from absolute enrichment predictions to enrichment prediction changes predictions = predictions-start_data[,colnames(predictions)] return(predictions) } #The function paste_processed_data pastes data of different epigeentic marks at one (!) time point together in one data frame paste_processed_data = function(paths,mark_names,quantile_cutoff=0.01){ #paths is the vector of paths to the RData files generated by Enrichments_per_bin_processing.R, so it might look like #c("/home/test.RData","/home/test2.RData") #mark_names is the vector of names of the marks at the respective positions and hence has to have the same lengths of the variable paths, so #if it looks like c("H3K4me3","H3K27me3") and the paths variable is as above, then "/home/test.RData" corresponds to H3K4me3 an so on #optional parameters: #quantile_cutoff (Deftault:0.01) is the quatile cutoff data. If for instnace it is 0.01, then the top 1% of the enrichments of each mark are set to 1, #the bottom 1% are set to 0 and the rest is scaled linearly. #If in the unusual cases where the cutoff for the top quantile_cutoff*100% and bottom quantile_cutoff*100% is identical, we set all data for that mark to 0. #OUTPUT: #a data frame, where the (i,j)-th entry corresponds to the enrichment at bin i of mark j, the column names is the vector mark_names, #the row names is the bin names of the loaded csv files for(i in 1:length(paths)){ load(paths[i]) if(i==1){ pasted_data = matrix(0,nrow(dataset),length(paths)) rownames_pasted_data = names(dataset) } q1=quantile(dataset,quantile_cutoff) q2=quantile(dataset,1-quantile_cutoff) dataset[datasetq2]=q2 dataset = dataset-q1 if(q2>q1){ dataset = dataset/(q2-q1) }else{ dataset = rep(0,length(dataset)) } pasted_data[,i] = dataset } pasted_data = data.frame(pasted_data) rownames(pasted_data) = rownames_pasted_data colnames(pasted_data) = mark_names return(pasted_data) } #The function ode_exponential_matrix_rows returns a list of vectors where to each mark (from the parameter marks) #we do have a vector in that list describing the linear relationship between all data in start_data and that particular mark in target_data ode_exponential_matrix_rows = function(start_data,target_data,ODE_mode = "without intercept",marks = NULL){ #start_data is a data frame, where the (i,j)-th entry is the enrichment at bin i and mark j, colnames must be mark names #target_data is analogous to start_data, with the condition the i-th row corresponds to the same bin as the i-th row in start_data #optional parameters: #ODE_mode (Default: "without intercept") for specifying whether or not the system of linear ODEs of first order model should be homogeneous ("without intercept") #or not, i.e., general ("with intercept") #marks (Default: NULL) to specify for which marks enrichments should be predicted; if it is NULL we predict all marks that are both in the start_data and target_data #if marks is something like c("H3K4me3","H3K27me3") then we will predict all marks that are in the start_data, the target_data and in the vector marks #OUTPUT: #a list where to each considered mark there is an entry which is a vector corresponding the linear weights describing the enrichment of that mark #in target_data by a linear model of all marks in start_data marks_in_both_data_frames = intersect(colnames(start_data),colnames(target_data)) if(is.null(marks)){ marks_to_consider = marks_in_both_data_frames }else{ marks_to_consider = intersect(marks_in_both_data_frames,marks) } exponential_matrix_row_list = list() for(mark in marks_to_consider){ if(ODE_mode == "with intercept"){ a = lm(formula=target_data[,mark]~., data = start_data[,]) v_add = a[[1]] names(v_add) = c("(Intercept)",colnames(start_data)) } if(ODE_mode == "without intercept"){ a = lm(formula=target_data[,mark]~.-1, data = start_data[,]) v_add = a[[1]] names(v_add) = colnames(start_data) } exponential_matrix_row_list[[mark]] = v_add } return(exponential_matrix_row_list) } #The function ode_logarithm_matrix reconstructs the exponential_matrix from exponential_matrix_rows_list and #returns the matrix logarithm on all marks that are present in both start_data and target_data (corresponds to the matrix A in dx/dt = A*x) in the homogeneous case. #In the not necessarily homogeneous case we reconstruct A and b for matching dx/dt = A*x+b #IMPORTANT: ode_exponential_matrix_rows should have been called beforehand for all marks with ODE_mode = "with intercept" (not necessarily homoegenous case) #or ODE_mode = "without intercept" (homogeneous case), and without come mixture (the algorithm detects automatically which case it is) ode_logarithm_matrix = function(exponential_matrix_rows_list,start_time_point,end_time_point,temporal_path){ #exponential_matrix_rows_list output from ode_exponential_matrix_rows with values for all marks that are present in both start_data and target_data #if ode_exponential_matrix_rows has been called for subsets of marks, then these must be brought together in one list #start_time_point is the start time point (can be in hours, minutes, etc.), e.g., 10 (corresponding to minutes) #end_time_point is the end time point (units must be consistent with start_time_point) #temporal_path is the path to a file name where we can store a csv file for a short time (the path has to be existent, not the file itself), #e.g. "/home/intermediate.csv" or "/home/temp/intermediate.csv" etc. #OUTPUT: #in the homogeneous case a list with A corresponding to the matrix in dx/dt = A*x+b #in the not necessarily homogeneous case a list with A and b corresponding to the matrix and vector in dx/dt = A*x+b #reconstructing the exponential matrix by pasting together lines B = matrix(0,length(exponential_matrix_rows_list),length(exponential_matrix_rows_list)) colnames(B) = names(exponential_matrix_rows_list) rownames(B) = names(exponential_matrix_rows_list) #c is the constant terms in the expoential fit, if used c = rep(0,length(exponential_matrix_rows_list)) use_c = FALSE for(i in 1:length(exponential_matrix_rows_list)){ B[i,] = exponential_matrix_rows_list[[i]][colnames(B)] if("(Intercept)" %in% names(exponential_matrix_rows_list[[i]])){ use_c = TRUE c[i] = exponential_matrix_rows_list[[i]]["(Intercept)"] } } #if a not necessarily homogeneous system is assumed, we paste it to B to obtain the following matrix # B c # 0 1 #and apply the matrix algorithm to it instead of the origrinal B alone, then afterwards we get out A and b from the logarithmic matrix #which has the form # A b # 0 0 if(use_c){ B = cbind(B,c) B = rbind(B,rep(0,length(exponential_matrix_rows_list)+1)) B[length(exponential_matrix_rows_list)+1,length(exponential_matrix_rows_list)+1] = 1 colnames(B) = c(names(exponential_matrix_rows_list),"constant") rownames(B) = c(names(exponential_matrix_rows_list),"constant") } #write matrix exponential for further processing matlab to csv file write.table(B,col.names = FALSE,row.names = FALSE, file = temporal_path, sep =",") #call matlab for matrix logarithm logm (can handle complex values which might arise) system_string = paste0('matlab -nodisplay -r "',"expA = csvread('",temporal_path,"',0,0);A = logm(expA);","A=A/(",end_time_point-start_time_point,");", "csvwrite('",temporal_path,"',A);",'exit"') system(system_string) #read the matrix logarithm into matlab again A = read.csv(temporal_path,header = FALSE) #delete intermediate file unlink(temporal_path) #the matrix might be complex in case complex values are present, normally it is numeric A = as.matrix(A) if(!use_c){ colnames(A) = names(exponential_matrix_rows_list) rownames(A) = names(exponential_matrix_rows_list) output_list = list("A"=A) }else{ #in the not necessarily homogeneous case, A and b is reconstructed as outlined above b = A[1:(nrow(A)-1),ncol(A)] A = A[1:(nrow(A)-1),1:(ncol(A)-1)] colnames(A) = names(exponential_matrix_rows_list) rownames(A) = names(exponential_matrix_rows_list) names(b) =names(exponential_matrix_rows_list) output_list = list("A"=A,"b"=b) } return(output_list) }