-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHW_Categorical_Paranclitic.R
102 lines (86 loc) · 4.71 KB
/
HW_Categorical_Paranclitic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
source("Functions/functions.R")
#Variables to set and data
{ #Braces to collapse code section
file_location <- "Data/" #Path to data files.
result_folder <- "20170615_Density/" #Where to store output files. This folder will be created if it doesn't already exist.
file_name <- "Synthetic_WeightBaseCaseControl.csv"
first_columns <- 4 #column number of first parameter -1
number_of_continuous <- 30 #Number of continuous data columns. These should be after the sample meta data in your .csv file.
number_of_categoricals <- 9 #Number of categorical variables. These should be the final columns in your .csv file.
column_of_case_control <- 1 #Column that cases/controls are defined in.
case_number <- 1 #code for test cases.
control_number <- 0 #code for control cases
base_number <- 2 #code for network controls. These are used to building the underlying density distributions.
auc_case <- 3 #Cases for making the weights. This should be base_number +1. NA = not using
number_of_indexes <- 17 #Parenclitic function contains 17 indicies. This should be left at 17 unless the function "parenclitic" is edited.
bestmarker <- c("X1","X2","X3") #names of top 3 single markers
Best_col <- 5 #Best marker
Second_col <- 6 #second best marker
Third_col <- 7 #third best marker
grid_size <- 16 #size of the density matrix grid (e.g. 10 = 10x10 grid)
IndicesInModel <- 1 #the number of indices to include in the model
spectoset <- 0.95 #Set the specificity for logistic regression testing.
TotalConnectionsPlot <- 1 #Plot graphs of total connections for each threshold (1=yes)
#Threshold settings. These are required for "Program" section below.
st=0.5 #Starting threshold
d=0.05 #Increment. If scanning over multiple thresholds.
loops=1 #Number of iterations. If performing analysis at one threshold (st), set this to 1.
#Read in the data.
data=read.csv(paste(file_location,file_name,sep=""),na.strings=c("NA","#VALUE!",""))
#####FOR EXAMPLE DATA#####
data <- data[,-35] #Removing a categorical that was artificially set to 100% accuracy.
data <- data[which(data$V3==9),] #Use only samples closest to diagnosis.
##########################
#Example code for imputing missing values using Amelia algorithm.
# #Impute Missing Values
# set.seed=123
#
# data_imp <- amelia(data[,(first_columns+1):ncol(data)],m=5)
# data_imp <- (data_imp$imputations$imp1+data_imp$imputations$imp2+
# data_imp$imputations$imp3 +
# data_imp$imputations$imp4 +
# data_imp$imputations$imp5)/5
# data[,(first_columns+1):ncol(data)] <- data_imp
}
#Program
{#This loops over thresholds (st) loops number of times, calling "parenclitic" each time.
#The threshold is increased by an increment "d" each time.
#The best model for each threshold, the AUC, Spec and Sens are returned and written in "aucii.csv".
#If the file "aucii.csv" is not written, the data is stored in the global environment in the matrix aucii.
aucii=matrix(0,loops,5)
for(aucii_index in 1:loops){
threshold = (aucii_index-1)*d+st;
List_of_results <- parenclitic(data,result_folder,first_columns,number_of_continuous,column_of_case_control,case_number,control_number,base_number,threshold,number_of_categoricals,number_of_indexes,Best_col,Second_col,grid_size,IndicesInModel,spectoset,TotalConnectionsPlot,Third_col,auc_case,bestmarker)
aucii[aucii_index,1]=List_of_results$thres
aucii[aucii_index,2]=List_of_results$auc
aucii[aucii_index,3]=List_of_results$specificity
aucii[aucii_index,4]=List_of_results$sensitivity
aucii[aucii_index,5]=List_of_results$formula
}
colnames(aucii) <- c("thres","auc","specificity","sensitivity","formula")
if(file.exists(paste0(result_folder,"aucii.csv"))==T){
temp <- as.matrix(read.csv(paste0(result_folder,"aucii.csv")))
aucii <- rbind(temp,aucii)
}
write.csv(aucii,file=paste(result_folder,"aucii.csv",sep=""),row.names=F) #Contains results.
}
#AUC and Sensitivity Graph - if a number of threshold have been tried, this generates a graph for visual selection of the optimal threshold.
{
data_thres_auc <- read.csv(paste(result_folder,"aucii.csv",sep=""))
sens <- ggplot(data_thres_auc, aes(x=thres, y=sensitivity)) +
geom_point(shape=1, size=2, colour="red") +
xlab("Threshold") +
ylab("Sensitivity") +
ylim(c(0,1)) +
ggtitle("Sensitivity") +
theme(title=element_text(size=8))
auc <- ggplot(data_thres_auc, aes(x=thres, y=auc)) +
geom_point(shape=1, size=2, colour="red") +
xlab("Threshold") +
ylab("AUC") +
ylim(c(0,1)) +
ggtitle("AUC") +
theme(title=element_text(size=8))
plot_grid(sens,auc)
ggsave(paste(result_folder,"Sens_AUC.png",sep=""),width=8,height=3)
}