fix errors in pubchem queries and modify sleep
Jermiah committed Dec 14, 2023
1 parent f06e4c5 commit f2877fe
Showing 7 changed files with 326 additions and 307 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: AnnotationGx
Title: AnnotationGx: A package for building, updating and querying an
annotation database for pharmaco-genomic data.
Version: 0.0.5.9001
Version: 0.0.5.9002
Authors@R: c(
person("Christopher", "Eeles", role = c("aut"),
email = "[email protected]"),
67 changes: 45 additions & 22 deletions R/getChEMBL.R
@@ -131,13 +131,27 @@ getChemblAllMechanisms <- function(url="https://www.ebi.ac.uk",
#'| search | Special type of filter allowing a full text search based on elastic search queries |
#'| only | Select specific properties from the original endpoint and return only the desired properties on each record |
#'
#' @param resource `character(1)` Resource to query
#' @param field `character(1)` Field to query
#' @param filter_type `character(1)` Filter type
#' @param value `character(1)` Value to query
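#' @param format `character(1)` Output format; one of "json", "xml" or "yaml". Defaults to "json".
#' @return `character(1)` The constructed ChEMBL API query URL.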
#'
#'
#' @md
#' @export
constructChemblQuery <- function(resource, field, filter_type, value){
constructChemblQuery <- function(resource, field, filter_type, value, format = "json"){

# possible formats for now are XML, JSON and YAML
checkmate::assert_character(format)
checkmate::assert_character(resource)
checkmate::assert_character(field)
checkmate::assert_character(filter_type)
checkmate::assert_character(value)
checkmate::assert_subset(format, c("json", "xml", "yaml"))

url <- "https://www.ebi.ac.uk/chembl/api/data/"

final <- paste0(url, resource, ".json", "?", field, "__", filter_type, "=", value)
final <- paste0(url, resource, "?", "format=", format, "&", field, "__", filter_type, "=", value)

return (final)
}
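
Below is a minimal usage sketch (not part of the commit) of the updated signature; the ChEMBL ID is borrowed from the commented example in getChemblMechanism further down, and the printed URL is what the paste0() call above produces:

# sketch only: query URL built by the new format-aware signature
constructChemblQuery(resource = "mechanism", field = "molecule_chembl_id", filter_type = "in", value = "CHEMBL1413")
#> "https://www.ebi.ac.uk/chembl/api/data/mechanism?format=json&molecule_chembl_id__in=CHEMBL1413"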
@@ -251,31 +265,40 @@ getChemblMechanism <- function(
filter_type = "in",
returnURL = FALSE){


# constructChemblQuery(resource = "mechanism", field = "molecule_chembl_id", filter_type = "in", value = "CHEMBL1413")
urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID)
urls <- URLencode(urls)
# urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID)
# urls <- URLencode(urls)

if(returnURL) return(urls)

# [1] "action_type" "binding_site_comment"
# [3] "direct_interaction" "disease_efficacy"
# [5] "max_phase" "mec_id"
# [7] "mechanism_comment" "mechanism_of_action"
# [9] "mechanism_refs" "molecular_mechanism"
# [11] "molecule_chembl_id" "parent_molecule_chembl_id"
# [13] "record_id" "selectivity_comment"
# [15] "site_id" "target_chembl_id"
# [17] "variant_sequence"

responses <- data.table::rbindlist(lapply(urls, function(url){
# if(returnURL) return(urls)

cols <- c("molecule_chembl_id", "action_type",
"mechanism_of_action", "molecular_mechanism",
"mechanism_comment", "parent_molecule_chembl_id", "target_chembl_id")

responses <- lapply(chembl.ID, function(ID){
url <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = ID)
response <- httr::GET(url)
response <- AnnotationGx::parseJSON(response)
response <- parseJSON(response)
mechanisms <- data.table::as.data.table(response$mechanisms)
result <- mechanisms[,.(molecule_chembl_id, action_type, mechanism_of_action, molecular_mechanism, mechanism_comment, parent_molecule_chembl_id, target_chembl_id)]
}))

# if any of the expected cols are missing from the response, return a data.table with those cols set to NA and the query ID filled in
if(!all(cols %in% names(mechanisms))){
mechanisms <- data.table::data.table(molecule_chembl_id = ID)
mechanisms[, (setdiff(cols, "molecule_chembl_id")) := NA]
return(mechanisms)
}
return(mechanisms[,..cols])
})

return(responses)
# replace the _ in the column names with .
responses <- lapply(responses, function(x){
names(x) <- gsub("_", ".", names(x))
return(x)
})

return(data.table::rbindlist(responses, fill = TRUE))

}
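
A hedged usage sketch of the reworked function, assuming the default resource/field arguments (not all visible in this hunk); CHEMBL1413 comes from the commented example above, and CHEMBL25 is just an illustrative second ID:

# sketch only: one request per ChEMBL ID, results row-bound with fill = TRUE
mech <- getChemblMechanism(chembl.ID = c("CHEMBL1413", "CHEMBL25"))
mech$mechanism.of.action # underscores in the selected cols become "." in the result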


201 changes: 93 additions & 108 deletions R/getPubChem-helpers.R
@@ -8,52 +8,37 @@
if (throttleMessage == TRUE){
message(message)
}
# Request Count status: Green (5%), Request Time status: Green (0%), Service status: Green (30%)


matches <- regmatches(message, gregexpr("\\((.*?)%\\)", message)) # Extracts text within parentheses
percentages <- gsub("\\(|%|\\)", "", unlist(matches[1:3]))
# print(percentages)
percentage <- max(as.numeric(percentages))
if(as.integer(percentage) > 15 && as.integer(percentage) < 30){
Sys.sleep(15)
}else if (as.integer(percentage) > 30 && as.integer(percentage) < 50){
Sys.sleep(20)
}else if (as.integer(percentage) > 50 && as.integer(percentage) < 75) {
print(paste0("Throttling at ", percentage, "%. Sleeping for 30 seconds."))
Sys.sleep(30)
}else if (as.integer(percentage) > 75) {
print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds."))
Sys.sleep(60)
}else{
Sys.sleep(max(as.numeric(percentages)))
}
percentages <- as.numeric(gsub("\\(|%|\\)", "", unlist(matches[1:3])))
percentage <- max(percentages[1:3])
# percentage <- ifelse(percentages[3] > 75, percentages[3], max(percentages[1:2]))
sleep_time <- ifelse(percentage > 50, 30, ifelse(percentage > 30, 20, 15))

return(as.integer(percentage) > 15)
if (percentage > 15) {
# message(paste0("Sleeping for ", sleep_time, " seconds"))
Sys.sleep(sleep_time)
} else {
Sys.sleep(percentage)
}

return(percentage > 15)
}
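
For reference, a standalone sketch (not part of the commit) tracing the new sleep logic on the example header from the comment above:

hdr <- "Request Count status: Green (5%), Request Time status: Green (0%), Service status: Green (30%)"
matches <- regmatches(hdr, gregexpr("\\((.*?)%\\)", hdr))
percentages <- as.numeric(gsub("\\(|%|\\)", "", unlist(matches)))
max(percentages) # 30
ifelse(max(percentages) > 50, 30, ifelse(max(percentages) > 30, 20, 15)) # 15, since 30 is not strictly greater than 30
# with a maximum of 30% (> 15), the function sleeps for sleep_time (15 s) and returns TRUE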

## TODO:: Retrieve PubChem server status to dynamically set query spacing
##>based on server load
## TODO:: Make the query aware of server load status in the response header
#' @importFrom crayon strip_style
.queryPubChemSleep <- function(x, ..., query_only=FALSE) {
proxy <- list(...)$proxy
t1 <- Sys.time()
queryRes <- tryCatch({
queryRequestPubChem(x, ..., query_only=query_only)
},
error=function(e) {
cat('\r')
print(e)
list(Error=list(
Code='.queryPubChemSleep.ERROR',
Message='See Details for error msg',
Details=paste0(strip_style(e), collapse=' ')
))
})
t2 <- Sys.time()
queryTime <- t2 - t1
if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime)
return(queryRes)
#' Simple check of the PubChem throttling status via a default request
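#' @param sleep `logical(1)` If `TRUE`, also sleep according to the reported throttling level.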
#' @return Messages the `x-throttling-control` header; called for its side effect.
#' @export
getPubChemStatus <- function(sleep = FALSE){
response <- httr::GET("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/Aspirin/cids/JSON")
msg <- httr::headers(response)$`x-throttling-control`
matches <- regmatches(msg, gregexpr("\\((.*?)%\\)", msg)) # Extracts text within parentheses
if(sleep) .checkThrottlingStatus(response)
message(msg)
}
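
A quick usage sketch of the new helper; the exact header text depends on PubChem's load at call time:

getPubChemStatus() # messages e.g. "Request Count status: Green (0%), Request Time status: Green (0%), Service status: Green (20%)"
getPubChemStatus(sleep = TRUE) # additionally calls .checkThrottlingStatus() to sleep when throttling is elevated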


# -----------------------------
# getPubChemAnnotations Helpers

@@ -96,75 +81,75 @@
return(DT)
}

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseNSCannotations <- function(DT) {
DT[, NSC := unlist(lapply(Data, `[[`, i=4))]
annotationDT <- DT[,
.(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)),
by=.(SourceName, SourceID, NSC)]
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseNSCannotations <- function(DT) {
# DT[, NSC := unlist(lapply(Data, `[[`, i=4))]
# annotationDT <- DT[,
# .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)),
# by=.(SourceName, SourceID, NSC)]
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseCTDannotations <- function(DT) {
annotationDT <- DT[, .(CID=unlist(LinkedRecords)),
by=.(SourceName, SourceID, URL)]
annotationDT[, CTD := gsub('::.*$', '', SourceID)]
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseCTDannotations <- function(DT) {
# annotationDT <- DT[, .(CID=unlist(LinkedRecords)),
# by=.(SourceName, SourceID, URL)]
# annotationDT[, CTD := gsub('::.*$', '', SourceID)]
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames
.parseCASannotations <- function(list) {
# Make sure CIDs all go in the same column
CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords',
skip_absent=TRUE)
DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE)
DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))]
CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)]
ID_DT <- DT[,
.(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))),
by=.(SourceName, SourceID, Name, URL)]
annotationDT <- merge.data.table(
CAS_DT, ID_DT,
by=c('SourceName', 'SourceID', 'Name'),
all.x=TRUE,
allow.cartesian=TRUE)
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames
# .parseCASannotations <- function(list) {
# # Make sure CIDs all go in the same column
# CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords',
# skip_absent=TRUE)
# DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE)
# DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))]
# CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)]
# ID_DT <- DT[,
# .(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))),
# by=.(SourceName, SourceID, Name, URL)]
# annotationDT <- merge.data.table(
# CAS_DT, ID_DT,
# by=c('SourceName', 'SourceID', 'Name'),
# all.x=TRUE,
# allow.cartesian=TRUE)
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseSynonymsAndIdentifiers <- function(DT) {
dataList <- lapply(DT$Data, as.data.table)
names(dataList) <- DT$SourceID
dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE,
idcol='SourceID')
DT[, Data := NULL]
dataDT[,
Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'),
by=SourceID]
dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID]
dataDT[, Value.StringWithMarkup := NULL]
annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE)
setnames(annotationDT,
old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'),
new=c('Type', 'Heading', 'ID')
)
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseSynonymsAndIdentifiers <- function(DT) {
# dataList <- lapply(DT$Data, as.data.table)
# names(dataList) <- DT$SourceID
# dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE,
# idcol='SourceID')
# DT[, Data := NULL]
# dataDT[,
# Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'),
# by=SourceID]
# dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID]
# dataDT[, Value.StringWithMarkup := NULL]
# annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE)
# setnames(annotationDT,
# old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'),
# new=c('Type', 'Heading', 'ID')
# )
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseNamesAndSynonyms <- function(DT) {
DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])]
# Remove the weird annotation from the end of the synonym
DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')]
DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))]
# fix NULL list items
DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)]
DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)]
annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)),
by=.(SourceName, SourceID, Name, URL, Synonyms)]
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseNamesAndSynonyms <- function(DT) {
# DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])]
# # Remove the weird annotation from the end of the synonym
# DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')]
# DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))]
# # fix NULL list items
# DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)]
# DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)]
# annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)),
# by=.(SourceName, SourceID, Name, URL, Synonyms)]
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }
