-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgsea_pro_v3.R
230 lines (181 loc) · 9.32 KB
/
gsea_pro_v3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Gene Set Enrichment Analysis for Prokaryotes
# FUNAGE_Pro
#
# Anne de Jong
# University of Groningen
#
# April 19, 2022 Name changed to FUNAGE-Pro
# using phyper(q, m, n, k, lower.tail = TRUE, log.p = FALSE) from package {stat}
# load parameters from command line or call from the webserver
# R command line: R --vanilla --slave --args . robyn -r bacillus_cereus_merged_normalized.rpkm Factors.txt Contrasts.txt < /usr/molgentools/rnaseq/RNAseq_multifactor.R
sessiondir <- commandArgs()[5]
gene_list <- commandArgs()[6]
genome <- commandArgs()[7]
user_prefix <- commandArgs()[8]
cluster_id <- commandArgs()[9]
# Custom settings / or parameters from webserver
# for local testing:
# sessiondir = "E:\\Google Drive\\WERK\\GSEA_Pro\\dataset"
# gene_list = "gsea_pro_locus_list.txt"
# user_prefix = "my_results"
# genome <- 'ASM904v1_genomic.g2d'
# cluster_id <- '1'
# test FACoP
# sudo -u www-data R --vanilla --slave --args /tmp/GSEAPRO/GSEA_PRO/129.125.142.95.4sgv17tddtls9lbcchn2rmh264.883 /tmp/GSEAPRO/GSEA_PRO/129.125.142.95.4sgv17tddtls9lbcchn2rmh264.883/GSEA_Pro.list.single_list.txt /tmp/FACoP/FACoP/129.125.142.95.ncm44vdlmjc3q09s4bk4ee52i1.254/query.FACoP GSEA_Pro single_list < /data/gsea_pro/gsea_pro_v3.R
# preset values
classes <- list('COG','eggNOG_COG','GO','IPR', 'KEGG', 'KEYWORDS', 'Pfam', 'operons','REGULON')
class_colors <- list(IPR='#4D4D4D',GO='#5DA5DA',KEGG='#FAA43A', Pfam='#60BD68', SUPERFAMILY='#F17CB0', eggNOG_COG='#B2912F', KEYWORDS='#B276B2', COG='#B276B2', ENOG='#B276B2', operons='#B276B2', REGULON='#B276B2')
FACoP_classes <- list('COG','eggNOG_COG','GO','IPR', 'KEGG', 'KEYWORDS', 'Pfam')
# for FACoP classes the genome names is genome.FACoP and will be applied when needed
#genome = paste(genome,'.FACoP',sep="")
setwd(sessiondir)
# ------------------------------------------------------- functions ----------------------------------------------------------------------------
is.not.null <- function(x) ! is.null(x)
HypGeomDist_old <- function(k,m,n,N) {
# Hypergeometrical distribution -----------------------------------------------------------------
# It describes the chance when getting a certain amount of IDs (n) without
# replacement from the total possible ID's (N), hitting the number of CLASS's
# frequency(k) in the TopTopHitsClass when the total CLASS frequency is (m)
# It uses the summation of the hypergeometrical distribution
# It calculates for each CLASS, what is the chance to get the number
# of 'k' TopHitsClass in the TopTopHitsClass or even more extreme when drawing randomly
# Summation of hypergeometrical distribution
# e.g.,
# k <- as.numeric(list(8, 2, 4, 7, 6, 2, 5, 2, 4, 2, 2, 2, 2, 2, 3, 3))
# m <- as.numeric(list( 9, 5, 56, 166, 249, 217, 107, 26, 160, 30, 13, 206, 99, 33, 18, 168))
# n <- 16
# N <- 1748
pvalues <- numeric(length(k))
ctk <- choose(N,n)
if ( is.infinite(ctk) ) {
# print ("Infinite number for ctk")
for (i in 1:length(k)){
pvalues[i]=phyper(k[i], m[i], N-m[i], n)
}
} else {
# print (ctk)
for (i in 1:length(k)){
pvalues[i]=0
for (j in k[i]:m[i]){
pvalues[i]=pvalues[i]+choose(m[i],j)*choose(N-m[i],n-j)/ctk}
}
}
return(pvalues)
}
HypGeomDist <- function(k,m,n,N) {
ctk <- choose(N,n)
pvalues <- numeric(length(k))
if ( is.infinite(ctk) ) {
i <- list(1:length(k))
pvalues <- lapply(i, function(x) { phyper(k[x], m[x], N-m[x], n) })
} else {
for (i in 1:length(k)){
j <- list( k[i]:m[i] )
pvalues[i] <- sapply(j, function(x) { sum(choose(m[i],x)*choose(N-m[i],n-x)/ctk) })
}
}
return(pvalues)
}
# -------------------------------------------------------------- main ---------------------------------------------------------------------------------------------------
# 1. Read gene list
GeneList<-read.table(gene_list, sep = "\t", quote = "", blank.lines.skip = TRUE, header=FALSE)
colnames(GeneList)[1] <- "ID"
#head(GeneList)
# 2. Analyses of all classes
print(paste('GENOME in R ==> ',genome))
results <- lapply(classes, function(x) {
# for testing: x<- 'GO' or x<- 'KEGG'
if (x %in% FACoP_classes) {
filename = paste(genome,'g2d.FACoP',x,sep='.')
} else {
filename = paste(genome,'g2d',x,sep='.')
}
filenameout = paste(user_prefix,"TopHitsClass",x,cluster_id,'txt',sep='.')
if (file.exists(filename)) {
# a. Read the classes Class data from files
CLASS_table<-read.table(filename, sep = "\t", quote = "", blank.lines.skip = TRUE, header=FALSE)
colnames(CLASS_table) <- c("ID","CLASS_ID","Description")
# head(CLASS_table)
# b. Add class data to the genes and write annotation to files
Annotation <- CLASS_table
Annotation$ID <- NULL
Annotation <- Annotation[order(Annotation$CLASS_ID),]
Annotation <- Annotation[!duplicated(Annotation[,1]),]
# head(Annotation)
TopHitsClass<-merge(CLASS_table, GeneList, by="ID")
write.table(TopHitsClass, file=filenameout, sep="\t", row.names=F, quote=FALSE)
# c. Get frequencies
# Frequencies of CLASS members in GeneList ==> k
ClassFrequency_TopHits<-as.data.frame(with(TopHitsClass[!duplicated(TopHitsClass), ], table(CLASS_ID)))
# Frequencies of CLASS members ==> m
ClassFrequency<-as.data.frame(with(CLASS_table[!duplicated(CLASS_table), ], table(CLASS_ID)))
# Combine the Frequency tables
CombinedClassCount<-cbind(ClassFrequency_TopHits,ClassFrequency$Freq)
colnames(CombinedClassCount)=c("CLASS_ID","Hits","ClassSize")
# remove Class members that are not present or only 1x in GeneList
CombinedClassCount<-CombinedClassCount[CombinedClassCount$Hits>1,]
# remove Class members that only occur 1x in the total class
CombinedClassCount<-CombinedClassCount[CombinedClassCount$ClassSize>1,]
#Checking how many ID's are left after merging them with CLASS's ==> N
foundg<-as.data.frame(with(TopHitsClass[!duplicated(TopHitsClass), ],table(ID)))
# d. Preparing variables for HypGeomDist
k <- CombinedClassCount$Hits
m <- CombinedClassCount$ClassSize
n <- nrow(CombinedClassCount)
N <- nrow(foundg)
if ( (k!=0) && (m!=0) && (n!=0) ) {
pvalues <- HypGeomDist(k,m,n,N)
adj_pvalues<-p.adjust(pvalues,method="BH",length(k))
result_table<-cbind(CombinedClassCount,pvalues,adj_pvalues)
result_table<-merge(result_table, Annotation, by="CLASS_ID")
result_table<-result_table[order(result_table$pvalues),]
result_table<-result_table[result_table$adj_pvalues<=0.05,]
result_table<-format(result_table, digits=2, nsmall=2)
x <- result_table
}
}
})
names(results) <- classes
names(results)
# 3. Add colors and export table for each class
result_tables <- NULL
result_tables <- lapply(names(results), function(x) {
if (is.not.null(results[[x]])) {
result_table <- unique(as.data.frame(results[x]))
colnames(result_table) <- colnames(results[[1]])
result_table$class <- x
result_table$color <- as.character(class_colors[x])
result_table$Ratio <- format( round(as.numeric(result_table$Hits) / as.numeric(result_table$ClassSize),2), nsmall=2 )
result_table$minFDR <- -log(as.numeric(result_table$adj_pvalues)+1e-12, 10)
result_table$log2ClassSize <- log(as.numeric(result_table$ClassSize), 2)
if (nrow(result_table)>0) {
filename = paste(user_prefix,'GSEA',x, cluster_id,'txt',sep='.')
write.table(result_table, file=filename, sep="\t", row.names=F, quote=FALSE)
}
result_table
}
})
names(result_tables) <- names(results)
head(result_tables)
# Only take the tophits after sorting on ratio
my_tophits <- 3
result_tables_tophits <- NULL
result_tables_tophits <- lapply(names(result_tables), function(x) {
# testing: x <- 'IPR'
result <- as.data.frame(result_tables[[x]])
head(result[order(as.numeric(result$Ratio), decreasing = TRUE), ], my_tophits)
})
names(result_tables_tophits) <- names(result_tables)
#head(result_tables_tophits)
# 4. Combine and save results
merged_table <- do.call("rbind", result_tables)
merged_table$cluster_id <- cluster_id
merged_table_tophits <- do.call("rbind", result_tables_tophits)
#merged_table_tophits$cluster_id <- cluster_id
# lower p-values then 1e-12 is always significant and for plotting purposes we add a plateau here
#merged_table$minFDR <- -log(as.numeric(merged_table$adj_pvalues)+1e-12, 10)
#merged_table$log2ClassSize <- log(as.numeric(merged_table$ClassSize), 2)
filename = paste(user_prefix,'GSEA_merged', cluster_id,'txt',sep='.')
write.table(merged_table, file=filename, sep="\t", row.names=F, quote=FALSE)
filename = paste(user_prefix,'GSEA_merged_tophits', cluster_id, 'txt',sep='.')
write.table(merged_table_tophits, file=filename, sep="\t", row.names=F, quote=FALSE)