-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgsea_pro_benchmark.R
194 lines (154 loc) · 7.49 KB
/
gsea_pro_benchmark.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Gene Set Enrichment Analysis for Prokaryotes
# GSEA_Pro
#
# Anne de Jong
# University of Groningen
#
# Januari 2017, Benchmark GSEA
# --------------------------------------------------------------------- parameters ---------------------------------------------------------------
# load parameters from command line or call from the webserver
sessiondir <- commandArgs()[5]
gene_list <- commandArgs()[6]
genome <- commandArgs()[7]
experiment <- commandArgs()[8]
# for local use:
#
# sessiondir = "E:\\Google Drive\\WERK\\GSEA_Pro"
# sessiondir = "C:\\Users\\Anne\\Google Drive\\WERK\\GSEA_Pro"
# gene_list = "genelist.txt"
# genome <- 'ASM904v1_genomic.g2d'
# experiment <- 'A_F71Y-WT'
# maxTopHits <- 200
maxTopHits <- 200
noiseValue <- 1.1
# this can be change to e.g. IPR, GO, KEGG
benchmark_class <- 'GO'
setwd(sessiondir)
# ------------------------------------------------------- functions ----------------------------------------------------------------------------
is.not.null <- function(x) ! is.null(x)
HypGeomDist <- function(k,m,n,N) {
ctk <- choose(N,n)
pvalues <- numeric(length(k))
if ( is.infinite(ctk) ) {
i <- list(1:length(k))
pvalues <- lapply(i, function(x) { phyper(k[x], m[x], N-m[x], n) })
} else {
for (i in 1:length(k)){
j <- list( k[i]:m[i] )
pvalues[i] <- sapply(j, function(x) { sum(choose(m[i],x)*choose(N-m[i],n-x)/ctk) })
}
}
return(pvalues)
}
load_class_data <- function(Class) {
# Read the classes Class data from files
# for testing: Class<- 'GO'
filename = paste(genome,'g2d',Class,sep='.')
if (file.exists(filename)) {
CLASS_table<-read.table(filename, sep = "\t", quote = "", blank.lines.skip = TRUE, header=FALSE)
colnames(CLASS_table) <- c("ID","CLASS_ID","Description")
}
return(CLASS_table)
}
Test_Class <- function(GeneSubList, CLASS_table) {
# for testing: GeneSubList<- my_gene_subset
TopHitsClass<-merge(CLASS_table, GeneSubList, by="ID")
# b. Get frequencies
# Frequencies of CLASS members in GeneSubList ==> k
ClassFrequency_TopHits<-as.data.frame(with(TopHitsClass[!duplicated(TopHitsClass), ], table(CLASS_ID)))
# Frequencies of CLASS members ==> m
ClassFrequency<-as.data.frame(with(CLASS_table[!duplicated(CLASS_table), ], table(CLASS_ID)))
# Combine the Frequency tables
CombinedClassCount<-cbind(ClassFrequency_TopHits,ClassFrequency$Freq)
colnames(CombinedClassCount)=c("CLASS_ID","Hits","ClassSize")
# remove Class members that are not present or only 1x in GeneSubList
CombinedClassCount<-CombinedClassCount[CombinedClassCount$Hits>1,]
# remove Class members that only occur 1x in the total class
CombinedClassCount<-CombinedClassCount[CombinedClassCount$ClassSize>1,]
#Checking how many ID's are left after merging them with CLASS's ==> N
foundg<-as.data.frame(with(TopHitsClass[!duplicated(TopHitsClass), ],table(ID)))
# c. Preparing variables for the formula to calculate pvalues
k <- CombinedClassCount$Hits
m <- CombinedClassCount$ClassSize
n <- nrow(CombinedClassCount)
N <- nrow(foundg)
sum_of_pvalues <- 1
adj_pvalues = 1 ;
if ( (k!=0) && (m!=0) && (n!=0) ) {
pvalues <- HypGeomDist(k,m,n,N)
adj_pvalues <- p.adjust(pvalues,method="BH",length(k))
}
return(sum(adj_pvalues < 0.01))
}
# -------------------------------------------------------------- main --------------------------------------------------------------------------------
# 1. Read gene list
GeneList<-read.table(gene_list, sep = "\t", quote = "", blank.lines.skip = TRUE, header=TRUE)
colnames(GeneList) <- c("ID", "Value")
GenesUp <- GeneList[order(GeneList$Value, decreasing = TRUE),]
GenesUp <- GenesUp[ which(GenesUp$Value > noiseValue),]
if (nrow(GenesUp) > maxTopHits) { GenesUp <- GenesUp[1:maxTopHits,] }
GenesDown <- GeneList[order(GeneList$Value, decreasing = FALSE),]
GenesDown <- GenesDown[ which(GenesDown$Value < -noiseValue),]
if (nrow(GenesDown) > maxTopHits) { GenesDown <- GenesDown[1:maxTopHits,] }
#head(GenesUp)
#head(GenesDown)
# 2. Load the Class data from genome classification file
CLASS_table <- load_class_data(benchmark_class)
#head(CLASS_table)
# 3. Test the class for max score
results <- NULL
# UP
my_list <- seq(3, nrow(GenesUp),2) # Start with minimum of 3 genes until all genes, with step 2
my_scores <- lapply(my_list, function(y) {
#y <- 149
my_gene_subset <- lapply(as.data.frame(GenesUp$ID), function(x) {x[seq(1,y,1)]})
names(my_gene_subset) = "ID"
x <- Test_Class(my_gene_subset, CLASS_table ) / (7+y)
})
my_scores_table <- as.data.frame(unlist(my_list))
my_scores_table$scores = unlist(my_scores)
colnames(my_scores_table) <- c("cutoff", "scores")
GenesUp$cutoff <- seq_len(nrow(GenesUp)) # add the rownumber as cutoff column to be able to merge the results
my_scores_table <- merge(my_scores_table, GenesUp, by = "cutoff", , incomparables = NA)
filename = paste('00.Scores_up',experiment,'txt',sep='.')
write.table(my_scores_table, file=filename, sep="\t", row.names=F, col.names=TRUE, quote=FALSE)
my_max <- my_scores_table[my_scores_table$score==max(my_scores_table$score),] # get the row with max score
my_max <- my_max[my_max$Value==max(my_max$Value),] # get the row with highest Value
results$UP.max <- my_max$score
results$UP.max_index <- my_max$cutoff
results$UP.cutoff <- round(my_max$Value, digits=2)
my_scores_table_up_down <- my_scores_table
# DOWN
my_list <- seq(3, nrow(GenesDown),2) # Start with minimum of 3 genes until all genes, with step 2
my_scores <- lapply(my_list, function(y) {
#y <- 149
my_gene_subset <- lapply(as.data.frame(GenesDown$ID), function(x) {x[seq(1,y,1)]})
names(my_gene_subset) = "ID"
x <- Test_Class(my_gene_subset, CLASS_table ) / (7+y)
})
my_scores_table <- as.data.frame(unlist(my_list))
my_scores_table$scores = unlist(my_scores)
colnames(my_scores_table) <- c("cutoff", "scores")
GenesDown$cutoff <- as.numeric(seq(nrow(GenesDown),1,-1)) # add the rownumber as cutoff column to be able to merge the results
GenesDown$cutoff <- 1+ abs(GenesDown$cutoff - max(GenesDown$cutoff)) # reverse order of numbers
my_scores_table <- merge(my_scores_table, GenesDown, by = "cutoff", , incomparables = NA)
filename = paste('00.Scores_down',experiment,'txt',sep='.')
write.table(my_scores_table, file=filename, sep="\t", row.names=F, col.names=TRUE, quote=FALSE)
my_max <- my_scores_table[my_scores_table$score==max(my_scores_table$score),] # get the row with max score
my_max <- my_max[my_max$Value==min(my_max$Value),] # get the row with highest Value
results$DOWN.max <- my_max$score
results$DOWN.max_index <- my_max$cutoff
results$DOWN.cutoff <- round(my_max$Value, digits=2)
# Merge and sort UP and DOWN and write table
my_scores_table_up_down <- rbind(my_scores_table_up_down, my_scores_table)
my_scores_table_up_down <- my_scores_table_up_down[order(my_scores_table_up_down$Value, decreasing = TRUE),]
#my_scores_table_up_down$row <- 1:nrow(my_scores_table_up_down)
my_scores_table_up_down$cutoff <- 1:nrow(my_scores_table_up_down)
# head(my_scores_table_up_down)
filename = paste('00.Scores_up_down',experiment,'txt',sep='.')
write.table(my_scores_table_up_down, file=filename, sep="\t", row.names=F, col.names=TRUE, quote=FALSE)
# 4. Write the results
results
filename = paste('00.GSEA_benchmark',experiment,'txt',sep='.')
write.table(t(as.data.frame(results)), file=filename, sep="\t", row.names=T, col.names=FALSE, quote=FALSE)
write.table(t(as.data.frame(results)), file='00.GSEA_benchmark.txt', sep="\t", row.names=T, col.names=FALSE, quote=FALSE)