# 200427_lvl5_mixall.R
library(cmapR)
library(ranger)
# prepping data ----
# Only fetch the inputs when `lvl5_data` is not already in the session:
# prefer the cached .RData file, and fall back to rebuilding via the script.
# NOTE(review): only `lvl5_data` is checked here; `lig16` (used below) is
# presumably created by the same inputs -- confirm.
if (!exists("lvl5_data")) {
  if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData")) {
    load("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData")
  } else {
    source("lvl5_inputs.R")
  }
}
# unbalanced ----
# Random 50/50 split of every sample whose `pert_iname` is one of `lig16`,
# regardless of how many samples each ligand contributes.
# NOTE(review): no seed is set in the visible script, so the split and the
# forest differ between runs; the IDs are saved next to the model below.
# NOTE(review): assumes rownames(lvl5_data@cdesc) are valid column names of
# lvl5_data@mat -- confirm against the input-building script.
temp_id <- rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname %in% lig16]
if (length(temp_id) < 2L) {
  # With 0 or 1 matching samples the training or test set would be empty.
  stop("Need at least two samples matching `lig16` to build a train/test split.",
       call. = FALSE)
}
trainIDs <- sample(temp_id, round(length(temp_id) / 2))
testIDs <- setdiff(temp_id, trainIDs)
# ^ training ----
# Transpose so that each selected column of the matrix becomes one row of `x`;
# drop = FALSE keeps a matrix even if a single ID is selected.
rfmodel <- ranger(
  x = t(lvl5_data@mat[, trainIDs, drop = FALSE]),
  y = as.factor(lvl5_data@cdesc[trainIDs, "pert_iname"]),
  num.threads = 8,
  verbose = FALSE
)
# ^ testing ----
rfresults <- predict(rfmodel, t(lvl5_data@mat[, testIDs, drop = FALSE]))
# Persist the model, its predictions and the exact split that produced them.
save(rfmodel, rfresults, trainIDs, testIDs,
     file = "~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall.RData")
# balancing data ----
# Balanced training set: every ligand in `lig16` contributes the same number
# of training samples (half of the smallest ligand's sample count, rounded
# down); all remaining samples of each ligand go to the test set.
# sapply(..., simplify = FALSE) is kept so the list is named by ligand.
temp_lig_id <- sapply(
  lig16,
  function(L) rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == L],
  simplify = FALSE
)
if (length(temp_lig_id) == 0L || min(lengths(temp_lig_id)) < 2L) {
  # Fewer than two samples for some ligand would give an empty training set.
  stop("Every ligand in `lig16` needs at least two samples for a balanced split.",
       call. = FALSE)
}
temp_size <- floor(min(lengths(temp_lig_id)) / 2)
trainIDs <- lapply(temp_lig_id, function(X) sample(X, temp_size))
# Map() never simplifies, so the result is always a list. The previous
# mapply() call returned a matrix whenever all ligands had equally many
# held-out samples, which unlist() then passed through unchanged.
testIDs <- Map(setdiff, temp_lig_id, trainIDs)
## balancing test data:
# temp_size <- min(temp_size,sapply(testIDs,length))
# testIDs <-sapply(testIDs,function(X) sample(X,temp_size),simplify=F)
##
# Flatten the per-ligand lists into plain character vectors of sample IDs.
trainIDs <- unlist(trainIDs, use.names = FALSE)
testIDs <- unlist(testIDs, use.names = FALSE)
# ^ training ----
# Transpose so that each selected column of the matrix becomes one row of `x`.
rfmodel <- ranger(
  x = t(lvl5_data@mat[, trainIDs, drop = FALSE]),
  y = as.factor(lvl5_data@cdesc[trainIDs, "pert_iname"]),
  num.threads = 8,
  verbose = FALSE
)
# ^ testing ----
rfresults <- predict(rfmodel, t(lvl5_data@mat[, testIDs, drop = FALSE]))
# Persist the model, its predictions and the exact split that produced them.
save(rfmodel, rfresults, trainIDs, testIDs,
     file = "~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced.RData")
# Data saturation test ----
# For every per-ligand training size N from 1 up to one less than the
# smallest ligand's sample count, draw N samples per ligand for training and
# hold out the rest, then fit and evaluate one forest per N.
# sapply(..., simplify = FALSE) is kept so the list is named by ligand.
temp_lig_id <- sapply(
  lig16,
  function(L) rownames(lvl5_data@cdesc)[lvl5_data@cdesc$pert_iname == L],
  simplify = FALSE
)
if (length(temp_lig_id) == 0L || min(lengths(temp_lig_id)) < 2L) {
  # The previous seq(1, min - 1, 1) failed with "wrong sign in 'by'" when the
  # smallest ligand had a single sample; fail with a clear message instead.
  stop("Every ligand in `lig16` needs at least two samples for the saturation test.",
       call. = FALSE)
}
# One list element per training size; each element is a per-ligand list.
trainIDs <- lapply(
  seq_len(min(lengths(temp_lig_id)) - 1),
  function(N) lapply(temp_lig_id, function(X) sample(X, N))
)
# Map() never simplifies, so each held-out set stays a per-ligand list.
testIDs <- lapply(
  trainIDs,
  function(X) Map(setdiff, temp_lig_id, X)
)
# lapply() always returns a list. sapply() collapsed these into a matrix when
# there was only one training size, so trainIDs[[N]] indexed single IDs.
trainIDs <- lapply(trainIDs, unlist, use.names = FALSE)
testIDs <- lapply(testIDs, unlist, use.names = FALSE)
# ^ training ----
# One forest per training size, in the same order as `trainIDs`.
rfmodel <- lapply(
  seq_along(trainIDs),
  function(N) {
    ranger(
      x = t(lvl5_data@mat[, trainIDs[[N]], drop = FALSE]),
      y = as.factor(lvl5_data@cdesc[trainIDs[[N]], "pert_iname"]),
      num.threads = 8,
      verbose = FALSE
    )
  }
)
# ^ testing ----
# Each forest is evaluated on the samples held out for its own training size.
rfresults <- lapply(
  seq_along(rfmodel),
  function(N) {
    predict(rfmodel[[N]], t(lvl5_data@mat[, testIDs[[N]], drop = FALSE]))
  }
)
# Persist every model, its predictions and the splits that produced them.
save(rfmodel, rfresults, trainIDs, testIDs,
     file = "~/Dropbox/GDB_archive/CMapCorr_files/200427_lvl5_mixall_balanced_saturated.RData")