refactor: fix error and add new functions for PubChem API. Start vignette for pubchem.
bhklab · Mar 7, 2024 · 0668580 · 0668580
1 parent 109f3ed
commit 0668580
Show file tree

Hide file tree

Showing 10 changed files with 194 additions and 32 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: AnnotationGx
 Title: AnnotationGx: A package for building, updating and querying an
     annotation database for pharmaco-genomic data
-Version: 0.0.0.9071
+Version: 0.0.0.9073
 Authors@R: c(
     person("Jermiah", "Joseph", role = c("aut", "cre"),
         email = "[email protected]"),
@@ -33,6 +33,8 @@ Suggests:
     knitr,
     rmarkdown
 Config/testthat/edition: 3
+Config/testthat/parallel: true
+Config/testthat/start-first: watcher, parallel*
 License: GPL (>= 3) + file LICENSE
 LazyData: true
 VignetteBuilder: knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,12 +8,15 @@ export(getChemblResourceFields)
 export(getPubchemAnnotationHeadings)
 export(getPubchemCompound)
 export(mapCell2Accession)
+export(mapCompound2CID)
 export(standardize_names)
 import(BiocParallel)
 import(data.table)
 importFrom(checkmate,assert)
 importFrom(checkmate,assert_atomic)
 importFrom(checkmate,assert_choice)
+importFrom(checkmate,assert_integerish)
 importFrom(checkmate,assert_logical)
+importFrom(checkmate,test_atomic)
 importFrom(checkmate,test_choice)
 importFrom(data.table,":=")
diff --git a/R/3_httr2_utils.R b/R/3_httr2_utils.R
@@ -45,8 +45,8 @@
 #' @param resp The response object from the HTTP request.
 #' @return The parsed JSON response.
 #' @noRd
-.parse_resp_json <- function(resp){
-    httr2::resp_body_json(resp, simplifyVector = TRUE)
+.parse_resp_json <- function(resp, simplifyVector = TRUE){
+    httr2::resp_body_json(resp, simplifyVector = simplifyVector)
 }
 
 

diff --git a/R/pubchem_helpers.R b/R/pubchem_helpers.R
@@ -37,11 +37,11 @@ getPubchemStatus <- function(
     }
     # Check if the request count or request time is
     if(parsed_info$service$status == "Black"){
-        message("WARNING: The request limit has been exceeded and requests are being blocked.")
+        .warn("The request limit has been exceeded and requests are being blocked.")
     }else if(parsed_info$service$status %in% c("Red", "Yellow")){
-        message("WARNING: The request limit has been reached or is close to being reached.")
+        .warn("The request limit has been reached or is close to being reached.")
     }else{
-        message("The request limit is not close to being reached.")
+        .warn("The request limit is not close to being reached.")
     }
     return(parsed_info)
 }

diff --git a/R/pubchem_rest.R b/R/pubchem_rest.R
@@ -24,29 +24,37 @@ getPubchemCompound <- function(
     ){
     funContext <- .funContext("getPubchemCompound")
 
-    if(to == 'property'){
+
+    to_ <- if(to == 'property'){
         checkmate::assert_atomic(properties, all.missing = FALSE)
         checkmate::assert_character(properties)
         to <- paste0(to, '/', paste0(properties, collapse = ','))
-    }
+    }else to
 
     requests <- lapply(ids, function(x) {
         .build_pubchem_rest_query(
-            id = x, domain = 'compound', namespace = from, operation = to, output = output,
+            id = x, domain = 'compound', namespace = from, operation = to_, output = output,
             raw = raw, query_only = query_only, ...)
         }
     )
     if(query_only) return(requests)
 
     resps_raw <- httr2::req_perform_sequential(requests, on_error = "continue")
     .debug(funContext, " Number of responses: ", length(resps_raw))
-
+    names(resps_raw) <- ids
     if(raw) return(resps_raw)
-    resps <- lapply(resps_raw, function(x){
-        .parse_resp_json(x) |> .parseQueryToDT()
-    })
 
-    names(resps) <- ids
+
+    # Parse the responses
+    resps <- .parse_pubchem_rest_responses(resps_raw)
+    failed <- sapply(resps_raw, httr2::resp_is_error, USE.NAMES = T)
+
+    if(any(failed)){
+        .warn(funContext, " Some queries failed. See the 'failed' object for details.")
+        failures <- lapply(resps_raw[failed], function(resp){
+            .parse_resp_json(resp)$Fault
+        })
+    }else failures <- NULL
 
     if(from != 'name'){
         responses <- data.table::rbindlist(resps, fill= TRUE)
@@ -55,10 +63,55 @@ getPubchemCompound <- function(
     }
     data.table::setnames(responses, 'V1', to, skip_absent=TRUE)
 
+    attributes(responses)$failed <- failures 
+
     responses
 }
 
 
+#' Map compound names to PubChem CIDs
+#'
+#' Wraps `getPubchemCompound()` with `from = 'name'` and `to = 'cids'` to query the PubChem REST API.
+#'
+#' @param names A character vector of compound names.
+#' @param raw Logical; if TRUE, return the unparsed API responses instead of a parsed table (default is FALSE).
+#' @param query_only Logical; if TRUE, return the built requests without performing them (default is FALSE).
+#' @param output The output format requested from the API (default is 'JSON'); non-raw results are parsed as JSON.
+#' @param ... Additional arguments to be passed to the getPubchemCompound function.
+#'
+#' @return The result of `getPubchemCompound()`: by default a table of names and CIDs, with any failed queries in its `failed` attribute.
+#'
+#' @examples
+#' mapCompound2CID(c("aspirin", "caffeine"))
+#'
+#' @export
+mapCompound2CID <- function(names, raw = FALSE, query_only = FALSE, output = 'JSON', ...){
+    getPubchemCompound(ids = names, from = 'name', to = 'cids', raw = raw, query_only = query_only, output = output, ...)
+}
+
+# Parse a named list of PubChem REST responses into one table per query.
+# A response that httr2::resp_is_error() flags is mapped to
+# .parseQueryToDT(NA_integer_) without reading its body, so an error body
+# that is not parseable JSON cannot abort the rest of the batch.
+# Returns a list with the same names as `responses`.
+.parse_pubchem_rest_responses <- function(responses){
+    checkmate::assert_list(
+        x = responses,
+        any.missing = FALSE,
+        names = 'named',
+        min.len = 1
+    )
+
+    # Iterate over the elements directly: lapply() keeps their names
+    lapply(responses, function(resp){
+        if(httr2::resp_is_error(resp)) return(.parseQueryToDT(NA_integer_))
+        .parseQueryToDT(.parse_resp_json(resp))
+    })
+}
+
+
+
+
 
 #' Build a query for the PubChem REST API
 #'
@@ -76,7 +129,7 @@ getPubchemCompound <- function(
 #'
 #' @return The query URL or the parsed response, depending on the arguments.
 #'
-#' @importFrom checkmate assert assert_choice assert_logical assert_atomic test_choice
+#' @importFrom checkmate assert assert_choice assert_logical assert_atomic test_choice assert_integerish test_atomic
 #'
 #' @keywords internal
 .build_pubchem_rest_query <- function(
@@ -102,8 +155,9 @@ getPubchemCompound <- function(
     )
     assert_choice(output, c('JSON', 'XML', 'SDF', 'TXT', 'CSV'))
     assert_logical(raw, query_only)
-    assert_atomic(id, all.missing = FALSE)
+    if(!test_atomic(id, any.missing = FALSE)) .err("id must be an atomic vector with no missing/NA values")
 
+    if(namespace == 'cid') assert_integerish(id)
 
     # -------------------------------------- Function context --------------------------------------
     funContext <- .funContext("query_pubchem_rest")

diff --git a/man/mapCompound2CID.Rd b/man/mapCompound2CID.Rd
diff --git a/tests/testthat/test_pubchem_helpers.R b/tests/testthat/test_pubchem_helpers.R
@@ -22,21 +22,20 @@ resp_ <- .buildURL(url) |> .build_pubchem_request() |> httr2::req_perform()
 test_that("checkThrottlingStatus Works", {
     response <- resp_
 
-    response$headers$`x-throttling-control` <-
-      "Request Count status: Yellow (60%), Request Time status: Yellow (60%), Service status: Yellow (60%)"
+    response$headers["X-Throttling-Control"] <- "Request Count status: Yellow (60%), Request Time status: Yellow (60%), Service status: Yellow (60%)"
     parsed_info <- AnnotationGx:::.checkThrottlingStatus2(response, printMessage = FALSE)
     expect_equal(parsed_info, list(request_count = list(status = "Yellow", percent = 60),
                                    request_time = list(status = "Yellow", percent = 60),
                                    service = list(status = "Yellow", percent = 60)))
 
-    response$headers$`x-throttling-control` <-
+    response$headers["X-Throttling-Control"] <- 
       "Request Count status: Red (80%), Request Time status: Red (80%), Service status: Red (80%)"
     parsed_info <- AnnotationGx:::.checkThrottlingStatus2(response, printMessage = FALSE)
     expect_equal(parsed_info, list(request_count = list(status = "Red", percent = 80),
                                    request_time = list(status = "Red", percent = 80),
                                    service = list(status = "Red", percent = 80)))
 
-    response$headers$`x-throttling-control` <-
+    response$headers["X-Throttling-Control"] <- 
       "Request Count status: Black (100%), Request Time status: Red (80%), Service status: Red (80%)"
     parsed_info <- AnnotationGx:::.checkThrottlingStatus2(response, printMessage = FALSE)
     expect_equal(parsed_info, list(request_count = list(status = "Black", percent = 100),

diff --git a/tests/testthat/test_pubchem_rest.R b/tests/testthat/test_pubchem_rest.R
@@ -4,11 +4,9 @@ library(checkmate)
 
 compounds <- c('temozolomide', 'erlotinib', 'TRETINOIN', 'TRAMETINIB', 'epigallocatechin-3-monogallate')
 
+
 # Comprehensive Tests:
 test_that("AnnotationGx::getPubchemCompound 5 Correct Drugs", {
-    # Test for all possible combinations of domain, namespace, operation, and output
-    domains <- c('compound')
-
     expected_cids <- c(5394, 176870, 444795, 11707110, 65064)
 
     result <- getPubchemCompound(ids = compounds, from = 'name', to = 'cids')
@@ -24,22 +22,66 @@ test_that("AnnotationGx::getPubchemCompound 5 Correct Drugs", {
 
 test_that("AnnotationGx::getPubchemCompound 1 Incorrect Drug", {
     # Test for an incorrect drug, scoped so it doesnt affect the other tests
-    compounds <- c('BAD_DRUG_NAME', compounds)
-    getPubchemCompound(ids = compounds, from = 'name', to = 'cids')
+    compounds <- c('BAD_DRUG_NAME')
+    result <- getPubchemCompound(ids = compounds, from = 'name', to = 'cids')
+    expect_data_table(
+        x = result,
+        types = c('character', 'integer'),
+        ncols = 2,
+        nrows = length(compounds),
+        col.names = 'named'
+    )
+
+    failed_queries <- attributes(result)$failed
+
+    expect_list(
+        failed_queries,
+        len = 1,
+        any.missing = FALSE,
+        names = 'named'
+    )
+
+    expect_equal(names(failed_queries), c('BAD_DRUG_NAME'))
+})
 
-    result <- getPubchemCompound('BAD', from='name', to = 'cids', raw = T)[[1]] |> 
-    AnnotationGx:::.parse_resp_json() |> 
-    AnnotationGx:::.parseQueryToDT()
+test_that("AnnotationGx::getPubchemCompound bad input", {
+
+    data(ctrp_treatmentIDs) # NOTE(review): presumably not a valid `ids` vector - confirm what this dataset holds
+    expect_error(getPubchemCompound(ctrp_treatmentIDs))
+
+})
 
+test_that("AnnotationGx::getPubchemCompound 2 Incorrect Drugs in a list", {
+    # Test for an incorrect drug, scoped so it doesnt affect the other tests
+    compounds <- c('BAD_DRUG_NAME', compounds, 'Another bad drug')
+    result <- getPubchemCompound(ids = compounds, from = 'name', to = 'cids')
     expect_data_table(
         x = result,
         types = c('character', 'integer'),
-        any.missing = FALSE,
-        ncols = 1,
-        nrows = 1,
+        ncols = 2,
+        nrows = length(compounds),
         col.names = 'named'
     )
-
+
+    failed_queries <- attributes(result)$failed
+
+    expect_list(
+        failed_queries,
+        len = 2,
+        any.missing = FALSE,
+        names = 'named'
+    )
+
+    expect_equal(names(failed_queries), c('BAD_DRUG_NAME', 'Another bad drug'))
+})
+
+test_that("AnnotationGx::getPubchemCompound errors if cid and not integer", {
+    expect_error(
+        AnnotationGx::getPubchemCompound(
+            ids= c(5394, 'PUGREST.BadRequest'), # c() coerces both ids to character, so the 'cid' integerish check rejects them
+            from = 'cid', to = 'property', 
+            properties = c('Title', 'MolecularFormula', 'InChIKey', 'CanonicalSMILES')
+    ))
 })
 
 

diff --git a/tests/testthat/testthat-problems.rds b/tests/testthat/testthat-problems.rds
diff --git a/vignettes/PubChemAPI.Rmd b/vignettes/PubChemAPI.Rmd
@@ -42,11 +42,44 @@ UniProt, ChEBI, and ChEMBL, given a specific identifier.
 
 ```{r setup}
 library(AnnotationGx)
+
+
+
 ```
 
 
 # 
 
+### Mapping from chemical name to PubChem CID
+The main function that is provided by the package is `mapCompound2CID`. 
+
+``` {r map aspirin to cid}
+mapCompound2CID("aspirin")
+```
+
+If a compound cannot be mapped, its CID is returned as `NA`, a warning is issued, and the failure details are stored in the `failed` attribute of the result.
+
+``` {r map non existent compound to cid}
+
+(result <- mapCompound2CID(c("non existent compound", "another bad compound")))
+
+failed <- attributes(result)$failed
+
+# get the list of failed inputs 
+names(failed)
+
+# get the error message for the failed input
+failed[1]
+
+```
+
+### Mapping from PubChem CID to Properties
+TODO::
+
+
+### Mapping from PubChem CID to Annotations
+TODO::
+
 
 # References
 1. PUG REST. PubChem Docs [website]. Retrieved from https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest.