-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGOFilter.R
53 lines (44 loc) · 2.53 KB
/
GOFilter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
GOFilter <- function(ExcelDataFilePath, GOVector, godir = "~/Desktop/RLibrary/godir-20240617", GOTermColumnName = "GO_id", ReAdjustPValues = TRUE, PValueColumnName = "pvalue", AdjustedPValueColumnName = "padj") {
suppressPackageStartupMessages(library("readxl"))
# Filtering
Data <- read_excel(ExcelDataFilePath)
ChildNodeIDs <- get_unique_child_nodes_batch(GOVector, godir = godir)
isRetained = logical(nrow(Data)) # initialize a vector of FALSEs
for (i in 1 : nrow(Data)) {
if (length(intersect(convert_string_to_go_vector(Data[i, GOTermColumnName]), ChildNodeIDs)) != 0) {
isRetained[i] <- TRUE
}
}
FilteredData <- Data[isRetained,]
# Readjust p-values if required
if (ReAdjustPValues) {
# Sort the table by p-values: unnecessary if the Excel data file was generated by the Flaski RNAseq pipeline (because the genes were sorted by their p-values already)
# Re-adjust p-values using the Benjamini--Hochberg correction (do not adjust p = NA; return 1 if the calculated p-adj > 1; preserve the order)
# Method 1:
# for (i in nrow(FilteredData) : 1) {
# FilteredData[i, AdjustedPValueColumnName] <- min(FilteredData[i, PValueColumnName] * sum(!is.na(FilteredData[, PValueColumnName])) / i, 1)
# if (!is.na(FilteredData[i + 1, AdjustedPValueColumnName]) &
# (FilteredData[i + 1, AdjustedPValueColumnName] < FilteredData[i, AdjustedPValueColumnName])) {
# FilteredData[i, AdjustedPValueColumnName] <- FilteredData[i + 1, AdjustedPValueColumnName]
# }
# }
# Method 2 (using the built-in function p.adjust()):
FilteredData[, AdjustedPValueColumnName] <- p.adjust(unlist(FilteredData[, PValueColumnName]), method = "fdr")
}
return(FilteredData)
}
convert_string_to_go_vector <- function(string, pattern = "; ") {
suppressPackageStartupMessages(library("stringr"))
Vector <- unlist(str_split(string, pattern))
# Remove empty elements
return(Vector[nzchar(Vector)])
}
# To generate the latest godir, See: https://bioconductor.org/packages/release/bioc/vignettes/GOfuncR/inst/doc/GOfuncR.html#conversion-from-.obo-format
# Python scripts to convert https://current.geneontology.org/ontology/go-basic.obo into a godir: https://github.com/sgrote/OboToTerm
get_unique_child_nodes_batch <- function(GOVector, godir) {
suppressPackageStartupMessages(library("GOfuncR"))
# A GO-ID itself is also considered as a child with a distance of 0
ChildNodes <- get_child_nodes(GOVector, godir = godir)
ChildNodeIDs <- unique(ChildNodes$child_go_id)
return(ChildNodeIDs)
}