Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New functions for ChEMBL, PubChem, and Cellosaurus. #11

Merged
merged 18 commits into from
Oct 26, 2023
10 changes: 5 additions & 5 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, jermiah]
pull_request:
branches: [main, master]

Expand Down Expand Up @@ -31,18 +31,18 @@ jobs:
steps:
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: rcmdcheck

- uses: r-lib/actions/check-r-package@v1
- uses: r-lib/actions/check-r-package@v2
with:
error-on: '"error"'
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Version: 0.0.5.9001
Authors@R: c(
person("Christopher", "Eeles", role = c("aut"),
email = "[email protected]"),
person("Jermiah", "Joseph", role = c("aut"),
email = "[email protected]"),
person("Sisira", "Nair", role = c("aut"),
email="[email protected]"),
person("Petr", "Smirnov", role=c("aut")),
Expand All @@ -30,7 +32,6 @@ Imports:
qs,
checkmate,
rlang,
CoreGx,
crayon,
memoise,
R6,
Expand Down
10 changes: 5 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(.buildURL)
export(.createQueryURLs)
export(.groupListByName)
export(addAnnotationColumnToDrugs)
export(characterToNamedVector)
Expand All @@ -13,12 +14,14 @@ export(downloadAndExtract)
export(downloadFDAOrangeBook)
export(find_remote_files_recursive)
export(fsquash)
export(getAllPubChemAnnotations)
export(getBrainArrayTable)
export(getCellApi)
export(getCellosaurus)
export(getCellosaurusAPI)
export(getCellosaurusDataFrame)
export(getCelloxml)
export(getChemblAllMechanisms)
export(getChemblMechanism)
export(getDrugTargets)
export(getFDAOrangeBookProducts)
export(getFailed)
Expand All @@ -31,7 +34,7 @@ export(getGencodeFilesTable)
export(getGencodeGRangesAnnotated)
export(getGuideToPharm)
export(getInfoFromCelllineInput)
export(getPubChemAnnotations)
export(getPubChemAnnotation)
export(getPubChemCompound)
export(getPubChemFromNSC)
export(getPubChemSubstance)
Expand All @@ -55,7 +58,6 @@ export(scrapeRemoteFTPFilesTable)
export(zenodoMetadata)
import(R6)
import(checkmate)
import(httr)
importFrom(BiocParallel,"bplog<-")
importFrom(BiocParallel,"bpprogressbar<-")
importFrom(BiocParallel,"bpworkers<-")
Expand All @@ -66,8 +68,6 @@ importFrom(BiocParallel,bpparam)
importFrom(BiocParallel,bpprogressbar)
importFrom(BiocParallel,bptry)
importFrom(BiocParallel,bpworkers)
importFrom(CoreGx,.errorMsg)
importFrom(CoreGx,.warnMsg)
importFrom(R6P,Singleton)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,mcols)
Expand Down
76 changes: 40 additions & 36 deletions R/getCellosaurus.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,54 +19,58 @@
#' @md
#' @importFrom xml2 read_xml
#' @export
# Memoised reader for the Cellosaurus XML dump: reads `url` with read_xml()
# and returns the parsed document. When `verbose` is TRUE, a timestamped
# message is emitted before and after the read.
getCelloxml <- memoise::memoise(
  function(url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml", verbose = TRUE) {
    # Both progress messages end with a timestamp in the same format.
    stamp <- function() format(Sys.time(), "%Y-%m-%d %H:%M:%S")

    if (verbose) {
      message(paste("xml read started from", url, stamp()))
    }

    cellosaurus_xml <- read_xml(url)

    if (verbose) {
      message(paste("xml read completed at", stamp()))
    }

    cellosaurus_xml
  }
)

#' @md
#' @importFrom xml2 read_xml xml_find_all
#' @export
#########ADD DOCS
cleanCellnames <-
function(main_xml, verbose = TRUE) {
getCelloxml <- memoise::memoise(function(url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml", verbose = TRUE) {
if (verbose) {
message(paste(
"Started removing special characters from cell line names in the xml",
"xml read started from",
url,
format(Sys.time(), "%Y-%m-%d %H:%M:%S")
))
}
matching <- xml_find_all(main_xml, "//cell-line/name-list/name/text()")
# A raw string, added in R 4.0 will excape characters for you: r"{ <string> }"
badchars <- r"{[\xb5]|[]|[ ,]|[;]|[:]|[-]|[+]|[*]|[%]|[$]|[#]|[{]|[}]|[[]|[]]|[|]|[^]|[/]|[\]|[.]|[_]|[ ]|[(]|[)]}"
for(i in 1:length(matching)){
node1 <- matching[[i]]
node1text <- xml_text(node1)
xml_par <- xml_find_first(node1, "parent::*")
xml_set_attr(xml_par, "cleanname", gsub(badchars,"",ignore.case = TRUE, node1text))
}
main_xml <- read_xml(url)
if (verbose) {
message(paste(
"Removed special characters from cell line names in the xml",
"xml read completed at",
format(Sys.time(), "%Y-%m-%d %H:%M:%S")
))
}
return(main_xml)
}
)

#' Clean Cell Names
#'
#' @description
#' Adds a `cleanname` attribute to every `name` node found under
#' `cell-line/name-list` in the document. The attribute holds the node's text
#' with every character matched by the internal `badchars` pattern removed.
#'
#' @details
#' The attribute is set with `xml2::xml_set_attr()`, which modifies the
#' document in place, so the object passed in by the caller is updated as well.
#'
#' @param main_xml An XML document that can be queried with
#'   `xml2::xml_find_all()`, e.g. the value returned by `getCelloxml()`.
#' @param verbose `logical(1)` Emit timestamped progress messages? Default
#'   is `TRUE`.
#'
#' @return `main_xml`, with a `cleanname` attribute on each matched `name`
#'   node.
#'
#' @md
#' @importFrom xml2 read_xml xml_find_all
#' @export
cleanCellnames <- function(main_xml, verbose = TRUE) {
  if (verbose) {
    message(paste(
      "Started removing special characters from cell line names in the xml",
      format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    ))
  }

  name_text_nodes <- xml2::xml_find_all(main_xml, "//cell-line/name-list/name/text()")

  # A raw string (added in R 4.0) escapes characters for you: r"{ <string> }"
  badchars <- r"{[\xb5]|[]|[ ,]|[;]|[:]|[-]|[+]|[*]|[%]|[$]|[#]|[{]|[}]|[[]|[]]|[|]|[^]|[/]|[\]|[.]|[_]|[ ]|[(]|[)]}"

  # gsub() is vectorised, so every name is cleaned in a single call instead of
  # once per loop iteration.
  clean_names <- gsub(badchars, "", xml2::xml_text(name_text_nodes), ignore.case = TRUE)

  # seq_along() rather than 1:length(): with no matching nodes, 1:0 would
  # iterate over c(1, 0) and fail on the first subscript.
  for (i in seq_along(name_text_nodes)) {
    name_node <- xml2::xml_find_first(name_text_nodes[[i]], "parent::*")
    xml2::xml_set_attr(name_node, "cleanname", clean_names[[i]])
  }

  if (verbose) {
    message(paste(
      "Removed special characters from cell line names in the xml",
      format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    ))
  }

  main_xml
}

#' Filter parent node cell-line and parse child nodes for required annotations
#' @param cell_ip is either cell name or cvcl id.
Expand Down
164 changes: 164 additions & 0 deletions R/getCellosaurusAPI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@

#' Create query URLs for the Cellosaurus API
#'
#' @description
#' Builds one Cellosaurus API query URL per entry of `cl_names`.
#'
#' @details
#' Spaces in `cl_names` are replaced by `%20`; no other characters are escaped.
#' For `GETfxn = "search/cell-line?"` the URL has the form
#' `<api>search/cell-line?q=<q><name>&rows=<num_results>&format=<format>&fields=<fields>`.
#' For `GETfxn = "cell-line/"` it has the form
#' `<api>cell-line/<name>?format=<format>&fields=<fields>`.
#'
#' An example call:
#' `computedURLs <- .createQueryURLs(api = "https://api.cellosaurus.org/", cl_names = c("22rv1", "Hela"), fields = c("id", "ac"))`
#'
#' @param api is the link to the API to build the URL, i.e.
#'   "https://api.cellosaurus.org/"
#' @param cl_names is a character vector of cell line names; one URL is built
#'   per element
#' @param format is the type of format to return from the API. Can be "txt" or
#'   "json"
#' @param num_results is the number of items to return, DEFAULT=1. Only used
#'   when `GETfxn` is "search/cell-line?"
#' @param GETfxn is the function to use on the cellosaurus website, either
#'   "search/cell-line?" (the default) or "cell-line/". An unambiguous prefix
#'   of either value is also accepted.
#' @param fields is a character vector of desired fields to include in the
#'   response; the values are joined with ","
#' @param q is the prefix placed in front of each name in the `q=` search
#'   parameter, DEFAULT="idsy:". Only used when `GETfxn` is "search/cell-line?"
#'
#' @return A character vector of URLs, one per element of `cl_names`
#'
#' @md
#' @export
.createQueryURLs <- function(api = "https://api.cellosaurus.org/",
                             cl_names,
                             format = "txt",
                             num_results = 1,
                             GETfxn = c("search/cell-line?", "cell-line/"),
                             fields,
                             q = "idsy:") {
  # Resolve the endpoint to a single string. The untouched default is a
  # length-2 vector, and `if (GETfxn == ...)` on it is an error in R >= 4.2.
  endpoints <- c("search/cell-line?", "cell-line/")
  GETfxn <- tryCatch(
    match.arg(GETfxn, endpoints),
    error = function(e) {
      stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'")
    }
  )

  encoded_names <- gsub(" ", "%20", cl_names)
  field_list <- paste(fields, collapse = ",")

  if (GETfxn == "search/cell-line?") {
    paste0(
      api,
      GETfxn,
      "q=", q,
      encoded_names,
      "&rows=", num_results,
      "&format=", format,
      "&fields=", field_list
    )
  } else {
    paste0(
      api,
      GETfxn,
      encoded_names,
      "?",
      "format=", format,
      "&fields=", field_list
    )
  }
}

#' Query Cellosaurus
#'
#' @description
#' This function takes a list of cell line names and fields of interest and
#' gets responses from the Cellosaurus API
#'
#' @details
#' One URL per entry of `cl_names` is built with `.createQueryURLs()` against
#' "https://api.cellosaurus.org/" and requested with `httr::GET()` through
#' `BiocParallel::bplapply()`.
#'
#' @return A list of responses, one per entry of `cl_names` and named by
#'   `cl_names`
#' @param cl_names is a list of the cell line names
#' @param fields is a list of desired fields to obtain for each cell line in
#'   the API query, i.e. if only trying to get synonyms and primary accession
#'   then fields=c("sy", "ac"). See
#'   https://api.cellosaurus.org/static/fields_help.html for all fields.
#' @param GETfxn is the function to use on the cellosaurus website, either
#'   "search/cell-line?" (the default) or "cell-line/"
#' @param querydomain is passed to `.createQueryURLs()` as `q`, the prefix
#'   placed in front of each name in a search query. DEFAULT="ac:"
#'
#' @md
#' @export
#'
getCellosaurusAPI <- function(
    cl_names, # List of cell line names
    fields = c(
      "id", # Recommended name. Most frequently the name of the cell line as provided in the original publication.
      "ac", # Primary accession. It is the unique identifier of the cell line. It is normally stable across Cellosaurus versions ...
      "sy", # List of synonyms.
      "misspelling", # Identified misspelling(s) of the cell line name
      "din", # Disease(s) suffered by the individual from which the cell line originated with its NCI Thesaurus or ORDO identifier.
      "ca", # Category to which a cell line belongs, one of 14 defined terms. Example: cancer cell line, hybridoma, transformed cell line.
      "sx", # Sex
      "ag", # Age at sampling time of the individual from which the cell line was established.
      "sampling-site", # Body part, organ, cell-type the cell line is derived from
      "metastatic-site" # Body part, organ the cell line is derived from in the case of a cell line originating from a cancer metastasis.
    ),
    GETfxn = c("search/cell-line?", "cell-line/"), # Function to use on the cellosaurus website
    querydomain = "ac:") {
  # Resolve the endpoint here so that a single string, not the length-2
  # default vector, is handed on to .createQueryURLs().
  GETfxn <- tryCatch(
    match.arg(GETfxn, c("search/cell-line?", "cell-line/")),
    error = function(e) {
      stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'")
    }
  )

  cellosaurus_api_url <- "https://api.cellosaurus.org/"

  computedURLs <- .createQueryURLs(
    api = cellosaurus_api_url,
    GETfxn = GETfxn,
    cl_names = cl_names,
    fields = fields,
    q = querydomain
  )

  # Namespace-qualified so the call does not depend on httr being imported
  # wholesale into the package namespace.
  responseList <- BiocParallel::bplapply(computedURLs, httr::GET)
  names(responseList) <- cl_names
  responseList
}

#' Clean cellosaurus responses
#'
#' @description
#' This function takes a list of Cellosaurus Responses and cleans them for use
#'
#' @details
#' The content of every response is extracted with `httr::content()` and split
#' into lines. For `GETfxn = "search/cell-line?"` the first 15 lines of each
#' response are dropped, the remaining lines are split into pieces, rows whose
#' first piece is "//" are removed, values sharing the same cell line and first
#' piece are collapsed with "; ", and the result is cast to one row per cell
#' line. For `GETfxn = "cell-line/"` every line is split into pieces, lines
#' containing "//" are dropped, the first pieces become the column names and
#' the second pieces the single row of values for that response.
#'
#' @return A `data.table`
#' @param responseList is a list of responses, presumably as returned by
#'   `getCellosaurusAPI()`; the names of the list are used as the `cellLine`
#'   column for "search/cell-line?"
#' @param GETfxn is the function that was used on the cellosaurus website to
#'   obtain `responseList`, either "search/cell-line?" (the default) or
#'   "cell-line/"
#'
#' @md
#' @export
#'
cleanCellosaurusResponse <- function(
    responseList,
    GETfxn = c("search/cell-line?", "cell-line/")) {
  # Resolve the endpoint to a single string. The untouched default is a
  # length-2 vector, and `if (GETfxn == ...)` on it is an error in R >= 4.2.
  GETfxn <- tryCatch(
    match.arg(GETfxn, c("search/cell-line?", "cell-line/")),
    error = function(e) {
      stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'")
    }
  )

  # Get content of each response, then separate content on newline character
  responseContent <- lapply(
    lapply(responseList, httr::content),
    function(x) strsplit(x = x, split = "\n")
  )

  # NOTE(review): the split pattern " ." used below is a regular expression
  # (a space followed by any one character), so it can split a line more than
  # once and consumes the character after the space - confirm this is the
  # intended separator.
  if (GETfxn == "search/cell-line?") {
    # Remove first 15 rows of content
    responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)])
    # Split every remaining line into pieces
    responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x, split = " ."))
    # rbind responses (do.call returns as one large matrix instead of dataframe)
    df_ <- lapply(responseContent_sub_split, function(x) do.call(rbind, x))

    # convert each response from matrix to data.table
    df_2 <- lapply(df_, function(x) data.table::data.table(x))

    # rbinds all responses (creates new column so all of cell line x will have its name in col cellLine)
    df_3 <- data.table::rbindlist(df_2, idcol = "cellLine")
    df_3 <- df_3[V1 != "//"]

    # Collapse all rows with the same cellLine & V1 (most often rows for cc) and separate by "; "
    df_4 <- df_3[, list(data = paste0(unique(na.omit(V2)), collapse = "; ")), by = c("cellLine", "V1")]

    # transpose
    data.table::dcast(df_4, cellLine ~ ...)
  } else {
    responseContent_sub_split <- lapply(responseContent, function(x) strsplit(x[[1]], split = " ."))

    data.table::rbindlist(lapply(responseContent_sub_split, function(x) {
      # Drop every line (a future column of the table) that has "//" in it.
      # NOTE(review): the original used `x[, -1]` here, which fails on a list
      # with "incorrect number of dimensions"; dropping the matching elements
      # follows the stated intent - confirm.
      has_terminator <- vapply(x, function(piece) any(grepl("//", piece)), logical(1))
      x <- x[!has_terminator]

      # Each remaining line becomes a column: row 1 holds the field keys and
      # row 2 the values, so row 1 is promoted to the column names.
      cvcl_dt <- data.table::as.data.table(x)
      names(cvcl_dt) <- as.character(cvcl_dt[1])
      cvcl_dt[2]
    }))
  }
}
Loading
Loading