From 77a99f3d9b72f87c880fcaceb06c3369e18559cb Mon Sep 17 00:00:00 2001 From: Jermiah Date: Wed, 13 Dec 2023 20:56:03 +0000 Subject: [PATCH] fix bugs in parseJSON, add multiCID functions --- NAMESPACE | 1 + R/getPubChem-helpers.R | 2 +- R/getPubChem.R | 97 +++++++++++++++++-------------- man/dot-getPubChemAnnotationDT.Rd | 11 ++++ man/downloadAndExtract.Rd | 2 +- man/getPubChemAnnotation.Rd | 5 +- man/getPubChemAnnotations.Rd | 27 +++++++++ 7 files changed, 96 insertions(+), 49 deletions(-) create mode 100644 man/dot-getPubChemAnnotationDT.Rd create mode 100644 man/getPubChemAnnotations.Rd diff --git a/NAMESPACE b/NAMESPACE index c5d2a4d..197e657 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ export(getGencodeGRangesAnnotated) export(getGuideToPharm) export(getInfoFromCelllineInput) export(getPubChemAnnotation) +export(getPubChemAnnotations) export(getPubChemCompound) export(getPubChemFromNSC) export(getPubChemSubstance) diff --git a/R/getPubChem-helpers.R b/R/getPubChem-helpers.R index ac514c1..24ee64c 100644 --- a/R/getPubChem-helpers.R +++ b/R/getPubChem-helpers.R @@ -23,7 +23,7 @@ print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds.")) Sys.sleep(60) }else{ - Sys.sleep(5) + Sys.sleep(max(as.numeric(percentages))) } return(as.integer(percentage) > 15) diff --git a/R/getPubChem.R b/R/getPubChem.R index 948273b..b59fd28 100644 --- a/R/getPubChem.R +++ b/R/getPubChem.R @@ -290,13 +290,24 @@ queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, #' @export parseJSON <- function(response, ..., encoding='UTF-8', query_only=FALSE) { if (isTRUE(query_only)) return(response) + response <- content(CAS, encoding = "UTF-8", as='text', type='JSON') + + if (is.null(response)) return(NULL) + if (is.na(response)) return(NA) + tryCatch({ - fromJSON(content(response, ..., as='text', type='JSON', - encoding=encoding)) + fromJSON(response, ...) }, error=function(e) { - fromJSON(content(response, ..., type='JSON', encoding=encoding)) + NA }) + # tryCatch({ + # fromJSON(content(response, ..., as='text', type='JSON', + # encoding=encoding)) + # }, + # error=function(e) { + # fromJSON(content(response, ..., type='JSON', encoding=encoding)) + # }) } #' Query the PubChem REST API, with the result automatically converted from @@ -768,7 +779,6 @@ getPubChemAnnotation <- function( compound, annotationType = 'ChEMBL ID', url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound', - output = 'JSON', timeout_s = 29, retries = 3, quiet = TRUE, @@ -786,8 +796,8 @@ getPubChemAnnotation <- function( # TODO:: add a check to see if the compound is a valid CID or SID # TODO:: allow for variaitons of headers due to spelling errors # Temporary: - if(header == "DILI") queryURL <- paste0(.buildURL(url, compound, output), '?heading=', "Drug Induced Liver Injury") - else queryURL <- paste0(.buildURL(url, compound, output), '?heading=', header) + if(header == "DILI") queryURL <- paste0(.buildURL(url, compound, 'JSON'), '?heading=', "Drug Induced Liver Injury") + else queryURL <- paste0(.buildURL(url, compound, 'JSON'), '?heading=', header) tryCatch({ result <- RETRY('GET', URLencode(queryURL), times = retries, quiet = quiet) @@ -798,58 +808,57 @@ getPubChemAnnotation <- function( .checkThrottlingStatus(result, throttleMessage = throttleMessage) result <- parseJSON(result) - # switch(header, - # 'ATC Code'=return(.parseATCannotations(annotationDT)), - # 'Drug Induced Liver Injury'=return(.parseDILIannotations(annotationDT)), - # 'NSC Number'=return(.parseNSCannotations(annotationDT)), - # 'CTD Chemical-Gene Interactions'=return(.parseCTDannotations(annotationDT)), - # 'Names and Synonyms'=return(.parseNamesAndSynonyms(annotationDT)), - # 'Synonyms and Identifiers'=return(.parseSynonymsAndIdentifiers(annotationDT)), - # 'CAS'=return(.parseCASannotations(annotationDT)), - # tryCatch({ - # parseFUN(annotationDT) - # }, - # error=function(e) { - # .warning(funContext, 'The parseFUN function failed: ', e, - # '. Returning unparsed results instead. Please test the parseFUN - # on the returned data.') - # return(annotationDT) - # }) - # ) - - - if (header == 'ChEMBL ID') { - result <- .parseCHEMBLresponse(result) - }else if (header == 'NSC Number'){ - result <- .parseNSCresponse(result) - }else if (header == 'DILI' || header =='Drug Induced Liver Injury'){ - result <- .parseDILIresponse(result) - }else if (header == 'CAS'){ - result <- .parseCASresponse(result) - }else if (header == 'ATC Code'){ - result <- .parseATCresponse(result) - } - - # Using switch instead of if statements result <- switch( header, 'ChEMBL ID' = .parseCHEMBLresponse(result), 'NSC Number' = .parseNSCresponse(result), 'DILI' = .parseDILIresponse(result), 'CAS' = .parseCASresponse(result), - 'ATC Code' = .parseATCresponse(result) - ) - + 'ATC Code' = .parseATCresponse(result)) + if (is.null(result)) result <- list(compound, "N/A") else result <- list(compound,result) + names(result) <- c("cid", header) return(result) } - + + +#' Retrieve PubChem annotations for a given compound +#' +#' This function retrieves PubChem annotations for a given compound using the specified annotations. +#' +#' @param compound The compound for which PubChem annotations are to be retrieved. +#' @param annotations A character vector specifying the annotations to retrieve. +#' @param ... Additional arguments to be passed to getPubChemAnnotation(). +#' +#' @return A merged data table containing the PubChem annotations for the specified compound. +#' +#' @examples +#' getPubChemAnnotations( +#' compound = "36314", +#' annotations= c('ChEMBL ID', 'NSC Number', 'Drug Induced Liver Injury')) +#' +#' @export +getPubChemAnnotations <- function(compound, annotations, ...){ + result <- lapply(annotations, .getPubChemAnnotationDT, compound = compound, ...) + names(result) <- annotations + Reduce(function(x, y) merge(x, y, by = "cid", all.x = TRUE), result) +} + + +#' Function that returns a DT of getPubChemAnnotation results +.getPubChemAnnotationDT <- function(compound, annotationType, ...){ + result <- getPubChemAnnotation(compound, annotationType, ...) + data.table::as.data.table(result) +} + + + #' Function that parses the results of the PubChem PUG-VIEW API for the CHEMBL ID header .parseCHEMBLresponse <- function(result){ result <- result$Record$Reference$SourceID - result <- gsub("::Compound", "", result) + result <- gsub("Compound::", "", result) return(result) } diff --git a/man/dot-getPubChemAnnotationDT.Rd b/man/dot-getPubChemAnnotationDT.Rd new file mode 100644 index 0000000..9a28910 --- /dev/null +++ b/man/dot-getPubChemAnnotationDT.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.getPubChemAnnotationDT} +\alias{.getPubChemAnnotationDT} +\title{Function that returns a DT of getPubChemAnnotation results} +\usage{ +.getPubChemAnnotationDT(compound, annotationType, ...) +} +\description{ +Function that returns a DT of getPubChemAnnotation results +} diff --git a/man/downloadAndExtract.Rd b/man/downloadAndExtract.Rd index 66fa13f..7d0735d 100644 --- a/man/downloadAndExtract.Rd +++ b/man/downloadAndExtract.Rd @@ -24,5 +24,5 @@ the specified \code{extract_fun} for more details.} Download a compressed file from a remote URL and extract it. } \seealso{ -\link[utils:unzip]{utils::unzip}, \link[utils:untar]{utils::untar}, \link[R.utils:gunzip]{R.utils::gunzip}, \link[R.utils:bunzip2]{R.utils::bunzip2} +\link[utils:unzip]{utils::unzip}, \link[utils:untar]{utils::untar}, \link[R.utils:compressFile]{R.utils::gunzip}, \link[R.utils:compressFile]{R.utils::bunzip2} } diff --git a/man/getPubChemAnnotation.Rd b/man/getPubChemAnnotation.Rd index 443d922..c43f552 100644 --- a/man/getPubChemAnnotation.Rd +++ b/man/getPubChemAnnotation.Rd @@ -8,7 +8,6 @@ getPubChemAnnotation( compound, annotationType = "ChEMBL ID", url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound", - output = "JSON", timeout_s = 29, retries = 3, quiet = TRUE, @@ -20,8 +19,6 @@ getPubChemAnnotation( \item{url}{\code{character(1)} The URL to perform API queries on. default = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound'} -\item{output}{\code{character(1)} The output format. Defaults to 'JSON'.} - \item{timeout_s}{\code{numeric(1)} The number of seconds to wait before timing out. Default is 29.} \item{retries}{\code{numeric(1)} The number of times to retry a failed query. Default is 3.} @@ -31,6 +28,8 @@ getPubChemAnnotation( \item{throttleMessage}{\code{logical(1)} Should a message be printed when the query is throttled? Default is FALSE.} \item{header}{\code{character(1)} A valid header name for the PUG VIEW annotations} + +\item{output}{\code{character(1)} The output format. Defaults to 'JSON'.} } \description{ queries the PubChem PUG-VIEW API to get a single annotation using a CID for a header diff --git a/man/getPubChemAnnotations.Rd b/man/getPubChemAnnotations.Rd new file mode 100644 index 0000000..675d46e --- /dev/null +++ b/man/getPubChemAnnotations.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{getPubChemAnnotations} +\alias{getPubChemAnnotations} +\title{Retrieve PubChem annotations for a given compound} +\usage{ +getPubChemAnnotations(compound, annotations, ...) +} +\arguments{ +\item{compound}{The compound for which PubChem annotations are to be retrieved.} + +\item{annotations}{A character vector specifying the annotations to retrieve.} + +\item{...}{Additional arguments to be passed to getPubChemAnnotation().} +} +\value{ +A merged data table containing the PubChem annotations for the specified compound. +} +\description{ +This function retrieves PubChem annotations for a given compound using the specified annotations. +} +\examples{ +getPubChemAnnotations( + compound = "36314", + annotations= c('ChEMBL ID', 'NSC Number', 'Drug Induced Liver Injury')) + +}