-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some further questions: - [ ] should we delete temp files after processing? - [ ] is using the replicate ids the right way to go? Some TODOs: - [ ] documentation for dependency management (probably a requirements.txt and a bash script to install using pip) Addresses issue #11
- Loading branch information
1 parent
0110646
commit 2e5bb9e
Showing
13 changed files
with
347 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,9 @@ | |
.Rhistory | ||
.RData | ||
.Ruserdata | ||
|
||
# temporary files for GPs | ||
python/temp* | ||
|
||
# Python | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,4 +19,5 @@ Imports: | |
ggplot2, | ||
ComplexHeatmap, | ||
reshape2, | ||
grDevices | ||
grDevices, | ||
rPython |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
##------------------------------------------------------------------------------------------ | ||
##------------------------------------------------------------------------------------------ | ||
#' Fit a GP using Python's GPy package. | ||
#' | ||
#' \code{fitGP} fit a GP using Python's GPy package | ||
#' | ||
#' @param replicates A vector of multiple replicates that user would like to fit GP on | ||
#' @param source Data source | ||
#' | ||
#' @return results A list of time, predicted mean, and variance of fit GP | ||
#' | ||
#' @examples | ||
#' fitGP(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3", "PHLC111_P7.706.B1", "PHLC111_P7.708.B3"), lpdx) | ||
#' fitGP(c("PHLC111_P7.702.A2", "PHLC111_P7.704.A4", "PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx) | ||
#' @export | ||
fitGP <- function(replicates, source) { | ||
times <- c() | ||
volumes <- c() | ||
|
||
max_time_index <- 0 | ||
max_volume_index <- 0 | ||
|
||
for (i in 1:length(replicates)) { | ||
df <- getExperiment(source, replicates[i]) | ||
times[[i]] <- df$time | ||
volumes[[i]] <- df$volume | ||
|
||
if (length(df$time) > max_time_index) { | ||
max_time_index <- length(df$time) | ||
} | ||
|
||
if (length(df$volume) > max_volume_index) { | ||
max_volume_index <- length(df$volume) | ||
} | ||
} | ||
|
||
# reformat times and volumes | ||
for (i in 1:length(times)) { | ||
times[[i]] <- c(times[[i]], rep(NA, max_time_index - length(times[[i]]))) | ||
volumes[[i]] <- c(volumes[[i]], rep(NA, max_volume_index - length(volumes[[i]]))) | ||
} | ||
|
||
# write all to tempfiles | ||
write.table(data.frame(lapply(t(times), as.numeric)), | ||
file="python/temptime.csv", row.names = FALSE, col.names = FALSE) | ||
write.table(data.frame(lapply(volumes, as.numeric), stringsAsFactors = FALSE), | ||
file="python/tempvolume.csv", row.names = FALSE, col.names = FALSE) | ||
|
||
# run python script | ||
system("python3 python/fit_single_gp.py") | ||
|
||
# get results | ||
results = read.csv("python/tempresults.csv", sep = " ") | ||
|
||
return(results) | ||
} | ||
|
||
##------------------------------------------------------------------------------------------ | ||
##------------------------------------------------------------------------------------------ | ||
#' Get KL divergence of two GPs | ||
#' | ||
#' \code{getGPStatistics} Get the KL divergence of two GPs. | ||
#' | ||
#' @param controlReplicates Vector of control replicates | ||
#' @param caseReplicates Vector of case replicates | ||
#' @param source Data source of replicates | ||
#' | ||
#' @return Returns the KL divergence value for two GPs. | ||
#' | ||
#' @examples | ||
#' getGPStatistics(c("PHLC111_P7.701.A1", "PHLC111_P7.703.A3"), c("PHLC111_P7.705.A5", "PHLC111_P7.707.B2"), lpdx) | ||
#' | ||
#' @export | ||
getGPStatistics <- function(controlReplicates, caseReplicates, source) { | ||
control_times <- c() | ||
case_times <- c() | ||
|
||
control_volumes <- c() | ||
case_volumes <- c() | ||
|
||
max_time_index <- 0 | ||
max_volume_index <- 0 | ||
|
||
for (i in 1:length(controlReplicates)) { | ||
df <- getExperiment(source, controlReplicates[i]) | ||
control_times[[i]] <- df$time | ||
control_volumes[[i]] <- df$volume | ||
|
||
if (length(df$time) > max_time_index) { | ||
max_time_index <- length(df$time) | ||
} | ||
|
||
if (length(df$volume) > max_volume_index) { | ||
max_volume_index <- length(df$volume) | ||
} | ||
} | ||
|
||
for (i in 1:length(caseReplicates)) { | ||
df <- getExperiment(source, caseReplicates[i]) | ||
case_times[[i]] <- df$time | ||
case_volumes[[i]] <- df$volume | ||
|
||
if (length(df$time) > max_time_index) { | ||
max_time_index <- length(df$time) | ||
} | ||
|
||
if (length(df$volume) > max_volume_index) { | ||
max_volume_index <- length(df$volume) | ||
} | ||
} | ||
|
||
# reformat times and volumes | ||
for (i in 1:length(control_times)) { | ||
control_times[[i]] <- c(control_times[[i]], rep(NA, max_time_index - length(control_times[[i]]))) | ||
control_volumes[[i]] <- c(control_volumes[[i]], rep(NA, max_volume_index - length(control_volumes[[i]]))) | ||
} | ||
|
||
# reformat times and volumes | ||
for (i in 1:length(case_times)) { | ||
case_times[[i]] <- c(case_times[[i]], rep(NA, max_time_index - length(case_times[[i]]))) | ||
case_volumes[[i]] <- c(case_volumes[[i]], rep(NA, max_volume_index - length(case_volumes[[i]]))) | ||
} | ||
|
||
# write all to tempfiles | ||
write.table(data.frame(lapply(t(control_times), as.numeric)), | ||
file="python/temp_control_time.csv", row.names = FALSE, col.names = FALSE) | ||
write.table(data.frame(lapply(control_volumes, as.numeric), stringsAsFactors = FALSE), | ||
file="python/temp_control_volume.csv", row.names = FALSE, col.names = FALSE) | ||
write.table(data.frame(lapply(t(case_times), as.numeric)), | ||
file="python/temp_case_time.csv", row.names = FALSE, col.names = FALSE) | ||
write.table(data.frame(lapply(case_volumes, as.numeric), stringsAsFactors = FALSE), | ||
file="python/temp_case_volume.csv", row.names = FALSE, col.names = FALSE) | ||
|
||
# run python script | ||
system("python3 python/two_gp_statistics.py") | ||
|
||
# get results | ||
results = read.csv("python/temp_statistics_results.csv", sep = " ") | ||
|
||
return(results) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Python Gaussian Processes | ||
|
||
Docs to come... |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env python | ||
import pandas as pd | ||
import numpy as np | ||
|
||
from gp import normalize_data, fit_gaussian_process | ||
|
||
if __name__ == '__main__': | ||
|
||
# import CSV files | ||
df_time = pd.read_csv('python/temptime.csv', delimiter=" ", header=None).dropna() | ||
df_volume = pd.read_csv('python/tempvolume.csv', delimiter=" ", header=None).dropna() | ||
|
||
# Replace any zeros in volume size with something that won't error out | ||
df_volume.replace(0, 0.000001, inplace=True) | ||
x, y, y_norm = normalize_data(df_time, df_volume) | ||
|
||
fit_gp = fit_gaussian_process(x, y_norm, len(df_time.columns)) | ||
|
||
# Print out results of GP | ||
increment_by = 0.25 | ||
predict_x = np.arange(0, max(x), increment_by) | ||
|
||
with open("python/tempresults.csv", "w") as f: | ||
print("x", ",", "prediction", ",", "variance", file=f) | ||
for i in predict_x: | ||
prediction_at_i = fit_gp.predict(np.asarray([[i]])) | ||
print(i, ",", prediction_at_i[0][0][0], ",", prediction_at_i[1][0][0], file=f) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
from GPy.models import GPRegression | ||
from GPy.kern import RBF | ||
import numpy as np | ||
|
||
from scipy.integrate import quad | ||
from scipy.stats import norm | ||
|
||
|
||
def normalize_data(X, Y): | ||
""" | ||
Normalizes all growths using normalize_first_day_and_log_transform helper function. | ||
:param X: Pandas dataframe of times | ||
:param Y: Pandas dataframe of volumes | ||
:return: | ||
""" | ||
X = X[0] | ||
x = np.asarray([[day] for day in X]) | ||
y = np.asarray([[size for size in Y[replicate]] for replicate in Y]) | ||
y_norm = __normalize_first_day_and_log_transform(y) | ||
|
||
return x, y, y_norm | ||
|
||
def __normalize_first_day_and_log_transform(y): | ||
""" | ||
Normalize by dividing every y element-wise by the first day's median | ||
and then taking the log. | ||
:param y: | ||
:return: | ||
""" | ||
if y.ndim == 1: | ||
return np.log(y / np.median(y[0])) | ||
else: | ||
return np.log(y / np.median(y.T[0])) | ||
|
||
|
||
def fit_gaussian_process(x, y_norm, num_replicates): | ||
""" | ||
:param x: Numpy array of times | ||
:param y_norm: Numpy array of normalized tumor volumes | ||
:param num_replicates: i.e. Number of columns in DF | ||
:return: | ||
""" | ||
|
||
kernel = RBF(input_dim=1, variance=1., lengthscale=10.) | ||
|
||
x = np.tile(x, (num_replicates, 1)) | ||
y = np.resize(y_norm, (y_norm.shape[0] * y_norm.shape[1], 1)) | ||
|
||
gp = GPRegression(x, y, kernel) | ||
gp.optimize_restarts(num_restarts=9, messages=False) | ||
|
||
return gp | ||
|
||
def calculate_kl_divergence(control_x, case_x, gp_control, gp_case): | ||
""" | ||
Calculates the KL divergence between the GPs fit for both the | ||
batched controls and batched cases. | ||
:param control_x: | ||
:param case_x: | ||
:param gp_control: | ||
:param gp_case: | ||
:return: kl_divergence | ||
""" | ||
|
||
kl_divergence = None | ||
|
||
def kl_integrand(t): | ||
mean_control, var_control = gp_control.predict(np.asarray([[t]])) | ||
mean_case, var_case = gp_case.predict(np.asarray([[t]])) | ||
|
||
return np.log10(var_case / var_control) + ((var_control + (mean_control - mean_case) ** 2) / (2 * var_case)) | ||
|
||
# TODO: Need to replace zero with drug start day - this data needs to come in from Xeva | ||
if len(control_x) > len(case_x): | ||
kl_divergence = abs(quad(kl_integrand, 0, max(case_x))[0] | ||
- max(case_x) / 2)[0] | ||
else: | ||
kl_divergence = abs(quad(kl_integrand, 0, max(control_x))[0] | ||
- max(control_x) / 2)[0] | ||
|
||
return kl_divergence |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
GPy==1.5.6 | ||
numpy==1.12.0 | ||
scipy==0.18.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
from gp import normalize_data, fit_gaussian_process, calculate_kl_divergence | ||
|
||
if __name__ == '__main__': | ||
|
||
# import CSV files | ||
df_control_time = pd.read_csv('python/temp_control_time.csv', delimiter=" ", header=None).dropna() | ||
df_case_time = pd.read_csv('python/temp_case_time.csv', delimiter=" ", header=None).dropna() | ||
df_control_volume = pd.read_csv('python/temp_control_volume.csv', delimiter=" ", header=None).dropna() | ||
df_case_volume = pd.read_csv('python/temp_case_volume.csv', delimiter=" ", header=None).dropna() | ||
|
||
# Replace any zeros in volume size with something that won't error out | ||
df_control_volume.replace(0, 0.000001, inplace=True) | ||
df_case_volume.replace(0, 0.000001, inplace=True) | ||
|
||
control_x, control_y, control_y_norm = normalize_data(df_control_time, df_control_volume) | ||
case_x, case_y, case_y_norm = normalize_data(df_case_time, df_case_volume) | ||
|
||
control_gp = fit_gaussian_process(control_x, control_y_norm, len(df_control_time.columns)) | ||
case_gp = fit_gaussian_process(case_x, case_y_norm, len(df_case_time.columns)) | ||
|
||
kl_divergence = calculate_kl_divergence(control_x, case_x, control_gp, case_gp) | ||
|
||
with open("python/temp_statistics_results.csv", "w") as f: | ||
print("kl_divergence", file=f) | ||
print(kl_divergence, file=f) |
Binary file not shown.