Merge pull request #47 from bhklab/development

Development
bhklab · Mar 27, 2024 · 1014ee1
2 parents e645297 + f09eac0
commit 1014ee1
Show file tree

Hide file tree

Showing 43 changed files with 1,075 additions and 423 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,8 @@ docs
 /Meta/
 .lintr
 covr
-TRASH
+TRASH
+Treatment-Annotation*.Rmd
+
+./*.csv
+CCLE_treatmentMetadata.csv
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: AnnotationGx
 Title: AnnotationGx: A package for building, updating and querying an
     annotation database for pharmaco-genomic data
-Version: 0.0.0.9090
+Version: 0.0.0.9095
 Authors@R: c(
     person("Jermiah", "Joseph", role = c("aut", "cre"),
         email = "[email protected]"),
@@ -22,17 +22,15 @@ Imports:
     crayon,
     httr2,
     data.table,
-    options,
-    BiocParallel,
     readr,
-    xml2
+    xml2,
+    memoise
 Suggests: 
     testthat (>= 3.0.0),
     covr,
     readxl,
     knitr,
     rmarkdown,
-    BiocStyle,
     RefManageR,
     sessioninfo
 Config/testthat/edition: 3

diff --git a/Dockerfile b/Dockerfile
@@ -10,9 +10,9 @@ COPY . /app
 WORKDIR /app
 
 # RUN R -e 'install.packages(c("BiocManager", "devtools", "jsonlite", "qpdf"), repos=c("https://cloud.r-project.org/", "https://cran.rstudio.com/"))'
-# RUN R -e 'BiocManager::install("BiocParallel")'
 RUN R -e 'install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/stable/%s/%s/%s", .Platform$pkgType, R.Version()$os, R.Version()$arch))'
 RUN R -e 'pak::pkg_install(".", dependencies=TRUE, upgrade=TRUE, ask = FALSE)'
+RUN R -e 'pak::cache_clean(); pak::meta_clean(force = TRUE)'
 
 # RUN install2.r --error --deps TRUE \
 #     qpdf \

diff --git a/NAMESPACE b/NAMESPACE
@@ -29,7 +29,6 @@ export(standardize_names)
 export(strSplit)
 export(unlistNested)
 exportMethods(matchNested)
-import(BiocParallel)
 importFrom(checkmate,assert)
 importFrom(checkmate,assert_atomic)
 importFrom(checkmate,assert_choice)

diff --git a/R/data.R b/R/AnnotationGx-data.R
@@ -1,9 +1,8 @@
 #' gdsc_sampleMetadata is some preprocessed sample metadata from the GDSC dataset
 #'
-#' A preprocessed version of the sample metadata from the GDSC dataset. This dataset
-#' contains the following columns: GDSC.Sample_Name, GDSC.BROAD_ID, GDSC.RRID, GDSC.COSMIC_ID, and CCLE.sampleid.
-#' This dataset is used in the AnnotationGx package to map cell line names from various sources to the
-#' Cellosaurus database.
+#' A preprocessed version of the sample metadata from the GDSC dataset. 
+#' This dataset is provided in the package to test the functionality of the package.
+#' The original dataset can be downloaded from the CancerRxGene website.
 #'
 #' @format A data table with 5 columns and 1001 rows.
 #' \describe{
@@ -38,10 +37,10 @@
 #' 
 "CTRP_treatmentMetadata"
 
-#' gCSI_sampleMetadata is some preprocessed sample metadata from the NCI60 dataset
+#' gCSI_sampleMetadata is some preprocessed sample metadata from the gCSI dataset
 #' 
 "gCSI_sampleMetadata"
 
-#' gCSI_treatmentMetadata is some preprocessed treatment metadata from the NCI60 dataset
+#' gCSI_treatmentMetadata is some preprocessed treatment metadata from the gCSI dataset
 #' 
 "gCSI_treatmentMetadata"
diff --git a/R/cellosaurus.R b/R/cellosaurus.R
@@ -125,9 +125,9 @@ mapCell2Accession <- function(
       result$query <- name
       return(result)
     }
-    response_dt <- parse_cellosaurus_text(resp, name, parsed, keep_duplicates)
+    response_dt <- .parse_cellosaurus_text(resp, name, parsed, keep_duplicates)
     response_dt
-    }) 
+  }) 
 
 
   responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE)
@@ -168,20 +168,23 @@ mapCell2Accession <- function(
 #' 
 #' @noRd 
 #' @keywords internal
-parse_cellosaurus_text <- function(resp, name, parsed = FALSE, keep_duplicates = FALSE){
+.parse_cellosaurus_text <- function(resp, name, parsed = FALSE, keep_duplicates = FALSE){
 
   responses_dt <- lapply(
       X = resp,
       FUN = .processEntry
   ) 
-
-  responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE)
+  tryCatch({
+    responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE)
+  }, error = function(e) {
+    .err(paste0("Error parsing response for ", name, ": ", e$message))
+  }) 
 
   responses_dt <- .formatSynonyms(responses_dt)
 
   if(!parsed) {
     responses_dt$query <- name
-    return(responses_dt)
+    return(responses_dt[, c("cellLineName", "accession", "query")])
   }
 
 

diff --git a/R/options.R b/R/options.R
diff --git a/R/pubchem_status.R b/R/pubchem_status.R
@@ -21,10 +21,20 @@ getPubchemStatus <- function(
   funContext <- .funContext("getPubchemStatus")
 
   request <- .buildURL(url) |> .build_pubchem_request()
-  response <- httr2::req_perform(request)
 
-  status_code <- httr2::resp_status(response)
-  message <- response$headers[["X-Throttling-Control"]]
+  # need to do NULL while loop bc sometimes X-Throttling-Control is not in the response
+  message <- NULL
+
+  while(is.null(message)) {
+    response <- httr2::req_perform(request)
+
+    if (httr2::resp_status(response) == 200) {
+      message <- response$headers[["X-Throttling-Control"]]
+    } else {
+      .warn("Request failed. Retrying...")
+      Sys.sleep(1)
+    }
+  }
   parsed_info <- .checkThrottlingStatus2(message, printMessage)
   if (returnMessage) {
     return(parsed_info)

diff --git a/R/pubchem_view.R b/R/pubchem_view.R
@@ -16,8 +16,8 @@
 getPubchemAnnotationHeadings <- function(
     type = "all", heading = NULL) {
   funContext <- .funContext("getPubchemAnnotationHeadings")
-  .debug(funContext, " type: ", type, " heading: ", heading)
 
+  .debug(funContext, " type: ", type, " heading: ", heading)
   # TODO:: messy...
   checkmate::assert(
     checkmate::test_choice(
@@ -56,6 +56,7 @@ getPubchemAnnotationHeadings <- function(
 #' @param parse_function A custom parsing function to process the response. Default is the identity function.
 #' @param query_only Logical indicating whether to return the query URL only. Default is FALSE.
 #' @param raw Logical indicating whether to return the raw response. Default is FALSE.
+#' @param nParallel The number of parallel processes to use. Default is 1.
 #'
 #' @return The annotated information about the PubChem compound.
 #'
@@ -66,29 +67,36 @@ getPubchemAnnotationHeadings <- function(
 #' @export
 annotatePubchemCompound <- function(
     cids, heading = "ChEMBL ID", source = NULL, parse_function = identity,
-    query_only = FALSE, raw = FALSE) {
+    query_only = FALSE, raw = FALSE, nParallel = 1
+  ) {
   funContext <- .funContext("annotatePubchemCompound")
 
+  .info(funContext, sprintf("Building requests for %s CIDs", length(cids)))
   requests <- lapply(cids, function(cid) {
     .build_pubchem_view_query(
       id = cid, record = "compound", heading = heading,
       output = "JSON", source = source
-    )
-  })
+      )
+   }
+  )
 
-  .debug(funContext, paste0("query:", sapply(requests, `[[`, i = "url")))
+  .debug(funContext, paste0("query: ", sapply(requests, `[[`, i = "url")))
+  if (query_only) return(requests)
 
-  if (query_only) {
-    return(requests)
-  }
+  tryCatch({
+    resp_raw <- httr2::req_perform_sequential(
+      reqs = requests, 
+      on_error = "continue",
+      progress = "Performing API requests..."
+  )}, error = function(e) {
+    .err(funContext, "An error occurred while performing requests:\n", e)
+  })
 
-  resp_raw <- httr2::req_perform_sequential(requests, on_error = "continue")
-  if (raw) {
-    return(resp_raw)
-  }
+  if (raw) return(resp_raw)
 
   responses <- lapply(seq_along(resp_raw), function(i){
     resp <- resp_raw[[i]]
+    if(is.null(resp)) return(NA_character_)
     tryCatch(
       {
         .parse_resp_json(resp)
@@ -107,7 +115,7 @@ annotatePubchemCompound <- function(
   })
 
   # apply the parse function to each response depending on heading
-  parsed_responses <- .bplapply(responses, function(response) {
+  parsed_responses <- parallel::mclapply(responses, function(response) {
     switch(heading,
       "ChEMBL ID" = .parseCHEMBLresponse(response),
       "CAS" = .parseCASresponse(response),
@@ -128,7 +136,10 @@ annotatePubchemCompound <- function(
         }
       )
     )
-  })
+  },
+  mc.cores = nParallel 
+)
+
 
   sapply(parsed_responses, .replace_null)
 

diff --git a/R/pubchem_view_helpers.R b/R/pubchem_view_helpers.R
@@ -4,13 +4,15 @@
 #'
 #' @keywords internal
 #' @noRd
-.get_all_heading_types <- function() {
+.get_all_heading_types_base <- function() {
   url <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/annotations/headings/JSON"
   req <- .build_pubchem_request(url)
   response <- httr2::req_perform(req) |> .parse_resp_json()
   .asDT(response[[1]][[1]])
 }
 
+#' @keywords internal
+.get_all_heading_types <- memoise::memoise(.get_all_heading_types_base)
 
 #' Build a PubChem REST query URL
 #'
@@ -41,7 +43,10 @@
 #' @keywords internal
 #' @noRd
 .build_pubchem_view_query <- function(
-    id, annotation = "data", record = "compound", page = NULL, version = NULL, heading = NULL, source = NULL, output = "JSON", ...) {
+    id, annotation = "data", record = "compound", 
+    page = NULL, version = NULL, heading = NULL, source = NULL,
+    output = "JSON", ...
+) {
   funContext <- .funContext(".build_pubchem_view_query")
 
 
@@ -60,16 +65,7 @@
                  has no substance headings"
       )
     } else {
-      check <- checkmate::check_character(
-        unique(getPubchemAnnotationHeadings(record, heading)$Heading),
-        min.chars = 1, min.len = 1
-      )
-      if (!isTRUE(check)) {
-        .err(
-          funContext, "Invalid heading: ", heading,
-          ". Use getPubchemAnnotationHeadings() to get valid headings."
-        )
-      }
+      checkmate::assert(heading %in% .get_all_heading_types()$Heading)
     }
     opts_ <- c(opts_, list(heading = heading))
   }
@@ -99,7 +95,7 @@
 
   url |>
     httr2::url_build() |>
-    .build_pubchem_request()
+    .build_request()
 }
 
 #' Generic function to parse one of the annotation helpers

diff --git a/R/utils-general.R b/R/utils-general.R
@@ -6,37 +6,6 @@
 #' @noRd
 .asDT <- function(x, ...) data.table::as.data.table(x, ...)
 
-
-
-#' Custom wrapper function for parallelizing lapply using BiocParallel.
-#'
-#' This function provides a convenient way to parallelize the lapply function
-#' using the BiocParallel package. It takes a list or vector \code{X} and applies
-#' the function \code{FUN} to each element in parallel. The parallelization is
-#' controlled by the \code{BPPARAM} argument, which defaults to the SerialParam
-#' object from BiocParallel.
-#'
-#' @param X A list or vector to apply the function to.
-#' @param FUN The function to apply to each element of \code{X}.
-#' @param ... Additional arguments to pass to \code{FUN}.
-#' @param BPPARAM A BiocParallel parameter object controlling the parallelization.
-#' @inheritParams BiocParallel::bplapply
-#' @return A list containing the results of applying \code{FUN} to each element of \code{X}.
-#'
-#' @import BiocParallel
-#'
-#' @examples
-#' # Apply a function to a list in parallel
-#' x <- list(1, 2, 3, 4, 5)
-#' .bplapply(x, function(x) x^2)
-#'
-#' @keywords internal
-#' @noRd
-.bplapply <- function(X, FUN, ..., BPPARAM = BiocParallel::SerialParam()) {
-  BiocParallel::bplapply(X, FUN, ..., BPPARAM = BPPARAM)
-}
-
-
 #' Parses the query response into a data table
 #'
 #' This function takes a query response and converts it into a data table using the `as.data.table` function from the `data.table` package.

diff --git a/R/utils-httr2.R b/R/utils-httr2.R
@@ -16,7 +16,7 @@
 #' @keywords internal
 .build_request <- function(url) {
   httr2::request(url) |>
-    httr2::req_retry(max_tries = 3) |>
+    httr2::req_retry(max_tries = 5, backoff = ~ 10) |>
     httr2::req_error(is_error = \(resp) FALSE)
 }
Original file line number / Diff line change
@@ -13,4 +13,8 @@ docs
 /Meta/
 .lintr
 covr
-TRASH
+TRASH
+Treatment-Annotation*.Rmd
+
+./*.csv
+CCLE_treatmentMetadata.csv