diff --git a/DESCRIPTION b/DESCRIPTION index 4a578fd..ef46603 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: AnnotationGx Title: AnnotationGx: A package for building, updating and querying an annotation database for pharmaco-genomic data. -Version: 0.0.5.9001 +Version: 0.0.5.9002 Authors@R: c( person("Christopher", "Eeles", role = c("aut"), email = "christopher.eeles@uhnresearch.ca"), diff --git a/R/getChEMBL.R b/R/getChEMBL.R index 7701117..bbf4f9b 100644 --- a/R/getChEMBL.R +++ b/R/getChEMBL.R @@ -131,13 +131,27 @@ getChemblAllMechanisms <- function(url="https://www.ebi.ac.uk", #'| search | Special type of filter allowing a full text search based on elastic search queries | #'| only | Select specific properties from the original endpoint and returns only the desired properties on each record | | #' +#' @param resource `character(1)` Resource to query +#' @param field `character(1)` Field to query +#' @param filter_type `character(1)` Filter type +#' @param value `character(1)` Value to query +#' +#' #' @md #' @export -constructChemblQuery <- function(resource, field, filter_type, value){ +constructChemblQuery <- function(resource, field, filter_type, value, format = "json"){ + + # possible formats for now are XML, JSON and YAML + checkmate::assert_character(format) + checkmate::assert_character(resource) + checkmate::assert_character(field) + checkmate::assert_character(filter_type) + checkmate::assert_character(value) + checkmate::assert_subset(format, c("json", "xml", "yaml")) url <- "https://www.ebi.ac.uk/chembl/api/data/" - final <- paste0(url, resource, ".json", "?", field, "__", filter_type, "=", value) + final <- paste0(url, resource, "?", "format=", format, "&", field, "__", filter_type, "=", value) return (final) } @@ -251,31 +265,40 @@ getChemblMechanism <- function( filter_type = "in", returnURL = FALSE){ - # constructChemblQuery(resource = "mechanism", field = "molecule_chembl_id", filter_type = "in", value = "CHEMBL1413") - urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID) - urls <- URLencode(urls) + # urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID) + # urls <- URLencode(urls) - if(returnURL) return(urls) - - # [1] "action_type" "binding_site_comment" - # [3] "direct_interaction" "disease_efficacy" - # [5] "max_phase" "mec_id" - # [7] "mechanism_comment" "mechanism_of_action" - # [9] "mechanism_refs" "molecular_mechanism" - # [11] "molecule_chembl_id" "parent_molecule_chembl_id" - # [13] "record_id" "selectivity_comment" - # [15] "site_id" "target_chembl_id" - # [17] "variant_sequence" - - responses <- data.table::rbindlist(lapply(urls, function(url){ + # if(returnURL) return(urls) + + cols <- c("molecule_chembl_id", "action_type", + "mechanism_of_action", "molecular_mechanism", + "mechanism_comment", "parent_molecule_chembl_id", "target_chembl_id") + + responses <- lapply(chembl.ID, function(ID){ + url <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = ID) response <- httr::GET(url) - response <- AnnotationGx::parseJSON(response) + response <- parseJSON(response) mechanisms <- data.table::as.data.table(response$mechanisms) - result <- mechanisms[,.(molecule_chembl_id, action_type, mechanism_of_action, molecular_mechanism, mechanism_comment, parent_molecule_chembl_id, target_chembl_id)] - })) + + # if cols are not in names then return an empty data.table with cols + if(!all(cols %in% names(mechanisms))){ + mechanisms <- data.table::data.table() + mechanisms[, (cols) := NA] + mechanisms[, "molecule_chembl_id" := ID] + return(mechanisms) + } + return(mechanisms[,..cols]) + }) - return(responses) + # replace the _ in the column names with . + responses <- lapply(responses, function(x){ + names(x) <- gsub("_", ".", names(x)) + return(x) + }) + + return(data.table::rbindlist(responses, fill = TRUE)) + } diff --git a/R/getPubChem-helpers.R b/R/getPubChem-helpers.R index 24ee64c..6005e50 100644 --- a/R/getPubChem-helpers.R +++ b/R/getPubChem-helpers.R @@ -8,52 +8,37 @@ if (throttleMessage == TRUE){ message(message) } + # Request Count status: Green (5%), Request Time status: Green (0%), Service status: Green (30%) + + matches <- regmatches(message, gregexpr("\\((.*?)%\\)", message)) # Extracts text within parentheses - percentages <- gsub("\\(|%|\\)", "", unlist(matches[1:3])) - # print(percentages) - percentage <- max(as.numeric(percentages)) - if(as.integer(percentage) > 15 && as.integer(percentage) < 30){ - Sys.sleep(15) - }else if (as.integer(percentage) > 30 && as.integer(percentage) < 50){ - Sys.sleep(20) - }else if (as.integer(percentage) > 50 && as.integer(percentage) < 75) { - print(paste0("Throttling at ", percentage, "%. Sleeping for 30 seconds.")) - Sys.sleep(30) - }else if (as.integer(percentage) > 75) { - print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds.")) - Sys.sleep(60) - }else{ - Sys.sleep(max(as.numeric(percentages))) - } + percentages <- as.numeric(gsub("\\(|%|\\)", "", unlist(matches[1:3]))) + percentage <- max(percentages[1:3]) + # percentage <- ifelse(percentages[3] > 75, percentages[3], max(percentages[1:2])) + sleep_time <- ifelse(percentage > 50, 30, ifelse(percentage > 30, 20, 15)) - return(as.integer(percentage) > 15) + if (percentage > 15) { + # message(paste0("Sleeping for ", sleep_time, " seconds")) + Sys.sleep(sleep_time) + } else { + Sys.sleep(percentage) + } + + return(percentage > 15) } -## TODO:: Retrieve PubChem server status to dynamically set query spacing -##>based on server load -## TODO:: Make the query away for server load status in response header -#' @importFrom crayon strip_style -.queryPubChemSleep <- function(x, ..., query_only=FALSE) { - proxy <- list(...)$proxy - t1 <- Sys.time() - queryRes <- tryCatch({ - queryRequestPubChem(x, ..., query_only=query_only) - }, - error=function(e) { - cat('\r') - print(e) - list(Error=list( - Code='.queryPubChemSleep.ERROR', - Message='See Details for error msg', - Details=paste0(strip_style(e), collapse=' ') - )) - }) - t2 <- Sys.time() - queryTime <- t2 - t1 - if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime) - return(queryRes) +#' simple checkThrottling Status by a default response +#' @return `logical` whether the query is throttled +#' @export +getPubChemStatus <- function(sleep = FALSE){ + response <- httr::GET("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/Aspirin/cids/JSON") + msg <- headers(response)$`x-throttling-control` + matches <- regmatches(msg, gregexpr("\\((.*?)%\\)", msg)) # Extracts text within parentheses + if(sleep) .checkThrottlingStatus(response) + message(msg) } + # ----------------------------- # getPubChemAnnotations Helpers @@ -96,75 +81,75 @@ return(DT) } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseNSCannotations <- function(DT) { - DT[, NSC := unlist(lapply(Data, `[[`, i=4))] - annotationDT <- DT[, - .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), - by=.(SourceName, SourceID, NSC)] - return(annotationDT) -} +# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +# .parseNSCannotations <- function(DT) { +# DT[, NSC := unlist(lapply(Data, `[[`, i=4))] +# annotationDT <- DT[, +# .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), +# by=.(SourceName, SourceID, NSC)] +# return(annotationDT) +# } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseCTDannotations <- function(DT) { - annotationDT <- DT[, .(CID=unlist(LinkedRecords)), - by=.(SourceName, SourceID, URL)] - annotationDT[, CTD := gsub('::.*$', '', SourceID)] - return(annotationDT) -} +# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +# .parseCTDannotations <- function(DT) { +# annotationDT <- DT[, .(CID=unlist(LinkedRecords)), +# by=.(SourceName, SourceID, URL)] +# annotationDT[, CTD := gsub('::.*$', '', SourceID)] +# return(annotationDT) +# } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames -.parseCASannotations <- function(list) { - # Make sure CIDs all go in the same column - CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords', - skip_absent=TRUE) - DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE) - DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))] - CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)] - ID_DT <- DT[, - .(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))), - by=.(SourceName, SourceID, Name, URL)] - annotationDT <- merge.data.table( - CAS_DT, ID_DT, - by=c('SourceName', 'SourceID', 'Name'), - all.x=TRUE, - allow.cartesian=TRUE) - annotationDT <- unique(annotationDT) - return(annotationDT) -} +# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames +# .parseCASannotations <- function(list) { +# # Make sure CIDs all go in the same column +# CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords', +# skip_absent=TRUE) +# DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE) +# DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))] +# CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)] +# ID_DT <- DT[, +# .(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))), +# by=.(SourceName, SourceID, Name, URL)] +# annotationDT <- merge.data.table( +# CAS_DT, ID_DT, +# by=c('SourceName', 'SourceID', 'Name'), +# all.x=TRUE, +# allow.cartesian=TRUE) +# annotationDT <- unique(annotationDT) +# return(annotationDT) +# } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseSynonymsAndIdentifiers <- function(DT) { - dataList <- lapply(DT$Data, as.data.table) - names(dataList) <- DT$SourceID - dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE, - idcol='SourceID') - DT[, Data := NULL] - dataDT[, - Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'), - by=SourceID] - dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID] - dataDT[, Value.StringWithMarkup := NULL] - annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE) - setnames(annotationDT, - old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'), - new=c('Type', 'Heading', 'ID') - ) - annotationDT <- unique(annotationDT) - return(annotationDT) -} +# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +# .parseSynonymsAndIdentifiers <- function(DT) { +# dataList <- lapply(DT$Data, as.data.table) +# names(dataList) <- DT$SourceID +# dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE, +# idcol='SourceID') +# DT[, Data := NULL] +# dataDT[, +# Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'), +# by=SourceID] +# dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID] +# dataDT[, Value.StringWithMarkup := NULL] +# annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE) +# setnames(annotationDT, +# old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'), +# new=c('Type', 'Heading', 'ID') +# ) +# annotationDT <- unique(annotationDT) +# return(annotationDT) +# } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseNamesAndSynonyms <- function(DT) { - DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])] - # Remove the weird annotation from the end of the synonym - DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')] - DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))] - # fix NULL list itemss - DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)] - DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)] - annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)), - by=.(SourceName, SourceID, Name, URL, Synonyms)] - annotationDT <- unique(annotationDT) - return(annotationDT) -} +# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +# .parseNamesAndSynonyms <- function(DT) { +# DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])] +# # Remove the weird annotation from the end of the synonym +# DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')] +# DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))] +# # fix NULL list itemss +# DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)] +# DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)] +# annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)), +# by=.(SourceName, SourceID, Name, URL, Synonyms)] +# annotationDT <- unique(annotationDT) +# return(annotationDT) +# } diff --git a/R/getPubChem.R b/R/getPubChem.R index b59fd28..becd152 100644 --- a/R/getPubChem.R +++ b/R/getPubChem.R @@ -200,7 +200,7 @@ getRequestPubChem <- function(id, domain='compound', namespace='cid', operation= #' @title queryPubChem #' #' @details -#' This function automatically parses the results of the +#' This function automatically parses the results of the #' #' @inheritParams getRequestPubChem #' @param ... Fall through parameters to `bpmapply`. @@ -243,11 +243,13 @@ queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, querySize <- ceiling(length(id) / numQueries) queries <- split(id, ceiling(seq_along(id) / querySize)) + queryRes <- bplapply(queries, FUN=.queryPubChemSleep, domain=domain, namespace=namespace, operation=operation, output=output, url=url, operation_options=operation_options, BPPARAM=BPPARAM, proxy=proxy, raw=raw, query_only=query_only, verbose = verbose) } else { + queryRes <- bplapply(id, FUN=.queryPubChemSleep, domain=domain, namespace=namespace, operation=operation, output=output, url=url, operation_options=operation_options, BPPARAM=BPPARAM, proxy=proxy, @@ -274,42 +276,6 @@ queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, return(queryRes) } - -#' Parse a JSON into a list -#' -#' @param response A `response` object as returned by `httr::GET` -#' @param as A `character` vector indicating the return type. Options are 'raw', -# 'text' or 'parsed'. Default is 'text'. -#' @param ... Additional arguments to the `httr::content` function. -#' -#' @seealso [httr::content] -#' -#' @md -#' @importFrom jsonlite fromJSON -#' @importFrom httr content -#' @export -parseJSON <- function(response, ..., encoding='UTF-8', query_only=FALSE) { - if (isTRUE(query_only)) return(response) - response <- content(CAS, encoding = "UTF-8", as='text', type='JSON') - - if (is.null(response)) return(NULL) - if (is.na(response)) return(NA) - - tryCatch({ - fromJSON(response, ...) - }, - error=function(e) { - NA - }) - # tryCatch({ - # fromJSON(content(response, ..., as='text', type='JSON', - # encoding=encoding)) - # }, - # error=function(e) { - # fromJSON(content(response, ..., type='JSON', encoding=encoding)) - # }) -} - #' Query the PubChem REST API, with the result automatically converted from #' JSON to a list. This only works when `output='JSON'` in `getRequestPubChem`. #' @@ -321,110 +287,33 @@ parseJSON <- function(response, ..., encoding='UTF-8', query_only=FALSE) { queryRequestPubChem <- function(..., query_only=FALSE) parseJSON(getRequestPubChem(..., query_only=query_only), query_only=query_only) +## TODO:: Retrieve PubChem server status to dynamically set query spacing +##>based on server load +## TODO:: Make the query away for server load status in response header +#' @importFrom crayon strip_style +.queryPubChemSleep <- function(x, ..., query_only=FALSE) { + proxy <- list(...)$proxy + queryRes <- tryCatch({ + queryRequestPubChem(x, ..., query_only=query_only) + }, + error=function(e) { + cat('\r') + print(e) + list(Error=list( + Code='.queryPubChemSleep.ERROR', + Message='See Details for error msg', + Details=paste0(strip_style(e), collapse=' ') + )) + }) + # if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime) + return(queryRes) +} ## ============================ ## queryPubChem wrapper methods ## ---------------------------- -## These methods further specialize the queryPubChem function to provide -## a simple user interface that does not require knowledge of the PubChem -## REST API to use. - - -#' @title getPubChemFromNSC -#' -#' @description -#' Return a data.table mapping from ids to the information specified in `to`. -#' -#' @param ids A `character` or `numeric` vector of valid NSC ids to use for the -#' query. -#' @param to A `character(1)` vector with the desired return type. Currently -#' only 'cids' and 'sids' are implemented, but other options are available -#' via the PubChem API. This corresponds to the `operation` portion of the -#' PubChem API URL Path. -#' @param ... Fall through arguments to bpmapply. Use this to pass in BPPARAM -#' parameter to customize parellization settings. Alternatively, just call -#' `register()` with your desired parallel backend configuration. -#' @param raw A `logical(1)` vector specifying whether to early return the raw -#' query results. Use this if specifying an unimplemented return to the `to` -#' parameter. -#' @param proxy `logical(1)` Should the query be routed through a random -#' proxy server. This is useful to keep trying queries if a user gets -#' blacklisted. -#' -#' @return A `data.table` where the first column is the specified NSC ids and -#' the second column is the results specified in `to`. -#' -#' @md -#' @importFrom data.table data.table as.data.table setcolorder -#' @export -getPubChemFromNSC <- function(ids, to='cids', ..., batch=TRUE, raw=FALSE, - proxy=FALSE, options=NA, query_only=FALSE) { - funContext <- .funContext('::getPubChemFromNSC') - - # -- make the GET request - queryRes <- queryPubChem(ids, domain='substance', ..., - namespace='sourceid/DTP.NCI', operation=to, batch=batch, raw=raw, - proxy=proxy, operation_options=options, query_only=query_only) - - # -- early return option - if (isTRUE(raw) || isTRUE(query_only)) return(queryRes) - - # -- handle failed queries - failedQueries <- attributes(queryRes)$failed - queries <- attributes(queryRes)$queries - - # -- rehandle failed queries, somehow they are getting past queryPubChem? - ## FIXME:: How are they failed queries not found in queryPubChem? - failed <- unlist(lapply(queryRes, names)) %in% c("Fault", "Bad", "Error") - if (any(failed)) { - newFailedQueries <- Map(list, query=queries[failed], failure=queryRes[failed]) - failedQueries <- c(failedQueries, newFailedQueries) - queryRes <- queryRes[!failed] - queries <- queries[!failed] - } - - # -- process the results - .replace_NULL_NA <- function(DT) lapply(DT, function(x) { - ifelse(is.null(x), rep(NA_integer_, length(x)), x) }) - - # TODO:: Determine if all results are wrapped in two lists? If not this may - #>break the function. - .parseQueryToDT <- function(queryRes) as.data.table(queryRes[[1]][[1]]) - queryRes <- lapply(queryRes, FUN=.parseQueryToDT) - queryRes <- rbindlist(queryRes, fill=TRUE) - switch(to, - 'cids'={ - unlistQueryRes <- queryRes[, NSC_id := unlist(queries)][, - lapply(.SD, FUN=.replace_NULL_NA)][, lapply(.SD, unlist)] - if (nrow(unlistQueryRes) > nrow(queryRes)) - .warning(funContext, 'Some IDs multimap to returned CIDs, - check for sduplicates to see which ones!') - if (any(is.na(unlistQueryRes$CID))) .warning(funContext, 'Some IDs - failed to map and will have NA CIDs.') - }, - 'sids'={ - unlistQueryRes <- queryRes[, NSC_id := unlist(queries)][, - lapply(.SD, FUN=.replace_NULL_NA)][, lapply(.SD, unlist)] - if (nrow(unlistQueryRes) > nrow(queryRes)) - .warning(funContext, 'Some IDs multimap to returned SIDs, - check for duplicates to see which ones!') - if (any(is.na(unlistQueryRes$SID))) .warning(funContext, 'Some IDs - failed to map and will have NA SIDs.') - }, - .error('The operation ', to, ' has not been implemented yet!', - ' To return the unprocessed results of the query, set `raw=TRUE`.') - ) - # rearrange columns so that NSC_id is first - setcolorder(unlistQueryRes, rev(colnames(unlistQueryRes))) - if (length(failedQueries) > 0) { - .warning(funContext, 'One or more queries failed, please see - `attributes()$failed` for more information.') - attributes(unlistQueryRes)$failed <- failedQueries - } - return(unlistQueryRes) -} #' @title getPubChemCompound @@ -702,13 +591,12 @@ getAllPubChemAnnotations <- tryCatch({ # bpworkers(BPPARAM) <- 5 bpprogressbar(BPPARAM) <- TRUE - print(BPPARAM) + # print(BPPARAM) BiocParallel::register(BPPARAM) }, error=function(e) .warning(funContext, 'Failed to set parallelzation parameters! Please configure them yourself and pass in as the BPPARAM argument.')) pageList <- bplapply(seq(2, numPages), function(i, queryURL, numPages) { - t1 <- Sys.time() encodedURL <- URLencode(paste0(queryURL, '&page=', i)) queryRes <- RETRY('GET', encodedQueryURL, timeout(1), times=10, quiet=TRUE) @@ -722,8 +610,6 @@ getAllPubChemAnnotations <- data.table.') return(data.table()) }) - t2 <- Sys.time() - queryTime <- t2 - t1 # if (queryTime < 0.31) Sys.sleep(0.31 - queryTime) return(page) }, queryURL=queryURL, numPages=numPages) @@ -782,7 +668,8 @@ getPubChemAnnotation <- function( timeout_s = 29, retries = 3, quiet = TRUE, - throttleMessage = FALSE + throttleMessage = FALSE, + query_only = FALSE ){ header <- annotationType @@ -795,30 +682,35 @@ getPubChemAnnotation <- function( # TODO:: add a check to see if the compound is a valid CID or SID # TODO:: allow for variaitons of headers due to spelling errors - # Temporary: - if(header == "DILI") queryURL <- paste0(.buildURL(url, compound, 'JSON'), '?heading=', "Drug Induced Liver Injury") - else queryURL <- paste0(.buildURL(url, compound, 'JSON'), '?heading=', header) + if(header == "DILI") header <- "Drug Induced Liver Injury" + queryURL <- paste0(.buildURL(url, compound, 'JSON'), '?heading=', header) + if(query_only) return(URLencode(queryURL)) tryCatch({ result <- RETRY('GET', URLencode(queryURL), times = retries, quiet = quiet) }, error=function(e) { print(paste0("Error: ", e$message)) - return(NULL) + return(list(compound, "NA")) }) .checkThrottlingStatus(result, throttleMessage = throttleMessage) - result <- parseJSON(result) - result <- switch( - header, - 'ChEMBL ID' = .parseCHEMBLresponse(result), - 'NSC Number' = .parseNSCresponse(result), - 'DILI' = .parseDILIresponse(result), - 'CAS' = .parseCASresponse(result), - 'ATC Code' = .parseATCresponse(result)) - - if (is.null(result)) result <- list(compound, "N/A") - else result <- list(compound,result) + result <- parseJSON(result) + + if(is.null(result) || is.na(result) || is.null(result$Record)){ + result <- "NA" + } + else { + result <- switch(header, + 'ChEMBL ID' = .parseCHEMBLresponse(result), + 'NSC Number' = .parseNSCresponse(result), + 'DILI' = .parseDILIresponse(result), + 'Drug Induced Liver Injury' = .parseDILIresponse(result), + 'CAS' = .parseCASresponse(result), + 'ATC Code' = .parseATCresponse(result)) + } + + result <- list(compound, result) names(result) <- c("cid", header) return(result) } @@ -843,19 +735,25 @@ getPubChemAnnotation <- function( getPubChemAnnotations <- function(compound, annotations, ...){ result <- lapply(annotations, .getPubChemAnnotationDT, compound = compound, ...) names(result) <- annotations - Reduce(function(x, y) merge(x, y, by = "cid", all.x = TRUE), result) -} + # all values of result should be a 2 column data.table with "cid" and the + # annotation as the column names. + # merge all of them together on "cid" and keep all values even if NA or NULL + Reduce(function(x, y) merge(x, y, by = "cid", all = TRUE), result) + +} #' Function that returns a DT of getPubChemAnnotation results +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .getPubChemAnnotationDT <- function(compound, annotationType, ...){ result <- getPubChemAnnotation(compound, annotationType, ...) - data.table::as.data.table(result) + as.data.table(result, na.rm = F) } #' Function that parses the results of the PubChem PUG-VIEW API for the CHEMBL ID header +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .parseCHEMBLresponse <- function(result){ result <- result$Record$Reference$SourceID result <- gsub("Compound::", "", result) @@ -863,6 +761,7 @@ getPubChemAnnotations <- function(compound, annotations, ...){ } #' Function that parses the results of the PubChem PUG-VIEW API for the NSC Number header +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .parseNSCresponse <- function(result){ result <- result$Record$Reference$SourceID[1] result <- gsub(" ", "", result) @@ -870,22 +769,23 @@ getPubChemAnnotations <- function(compound, annotations, ...){ } #' Function that parses the results of the PubChem PUG-VIEW API for the DILI header +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .parseDILIresponse <- function(result){ + if(length(result$Record$Section) == 0){ - result <- "NA" + result <- "NA" }else{ + dt_ <- as.data.table(result$Record$Section) dt_ <- as.data.table(dt_)$Section[[1]] dt_ <- as.data.table(dt_)$Section dt_ <- as.data.table(dt_) dt_ <- as.data.table(dt_)$Information - # print(as.data.table(dt_)[1:3, .(Name,unlist(Value))]) section <- as.data.table(dt_)[1:3, "DILI" := paste0(unlist(Name), ":", unlist(Value))] # if any of the first 4 rows are NA, remove it section <- section[!is.na(section$DILI)] - section <- paste0(section[1:3, DILI], collapse= "; ") # create a list for each row as Name:Value string with no spaces and no new lines @@ -893,15 +793,18 @@ getPubChemAnnotations <- function(compound, annotations, ...){ result <- c(section, reference) result <- paste0(result, collapse = "; ") } + return(result) } #' Function that parses the results of the PubChem PUG-VIEW API for the CAS header +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .parseCASresponse <- function(result){ result <- result$Record$Reference$SourceID[1] return(result) } #' Function that parses the results of the PubChem PUG-VIEW API for the ATC Code header +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist .parseATCresponse <- function(result){ if(length(result$Record$Section) == 0){ result <- "NA" @@ -917,4 +820,104 @@ getPubChemAnnotations <- function(compound, annotations, ...){ }} return(result) -} \ No newline at end of file +} + + +## These methods further specialize the queryPubChem function to provide +## a simple user interface that does not require knowledge of the PubChem +## REST API to use. + + +#' @title getPubChemFromNSC +#' +#' @description +#' Return a data.table mapping from ids to the information specified in `to`. +#' +#' @param ids A `character` or `numeric` vector of valid NSC ids to use for the +#' query. +#' @param to A `character(1)` vector with the desired return type. Currently +#' only 'cids' and 'sids' are implemented, but other options are available +#' via the PubChem API. This corresponds to the `operation` portion of the +#' PubChem API URL Path. +#' @param ... Fall through arguments to bpmapply. Use this to pass in BPPARAM +#' parameter to customize parellization settings. Alternatively, just call +#' `register()` with your desired parallel backend configuration. +#' @param raw A `logical(1)` vector specifying whether to early return the raw +#' query results. Use this if specifying an unimplemented return to the `to` +#' parameter. +#' @param proxy `logical(1)` Should the query be routed through a random +#' proxy server. This is useful to keep trying queries if a user gets +#' blacklisted. +#' +#' @return A `data.table` where the first column is the specified NSC ids and +#' the second column is the results specified in `to`. +#' +#' @md +#' @importFrom data.table data.table as.data.table setcolorder +#' @export +getPubChemFromNSC <- function(ids, to='cids', ..., batch=TRUE, raw=FALSE, + proxy=FALSE, options=NA, query_only=FALSE) { + funContext <- .funContext('::getPubChemFromNSC') + + # -- make the GET request + queryRes <- queryPubChem(ids, domain='substance', ..., + namespace='sourceid/DTP.NCI', operation=to, batch=batch, raw=raw, + proxy=proxy, operation_options=options, query_only=query_only) + + # -- early return option + if (isTRUE(raw) || isTRUE(query_only)) return(queryRes) + + # -- handle failed queries + failedQueries <- attributes(queryRes)$failed + queries <- attributes(queryRes)$queries + + # -- rehandle failed queries, somehow they are getting past queryPubChem? + ## FIXME:: How are they failed queries not found in queryPubChem? + failed <- unlist(lapply(queryRes, names)) %in% c("Fault", "Bad", "Error") + if (any(failed)) { + newFailedQueries <- Map(list, query=queries[failed], failure=queryRes[failed]) + failedQueries <- c(failedQueries, newFailedQueries) + queryRes <- queryRes[!failed] + queries <- queries[!failed] + } + + # -- process the results + .replace_NULL_NA <- function(DT) lapply(DT, function(x) { + ifelse(is.null(x), rep(NA_integer_, length(x)), x) }) + + # TODO:: Determine if all results are wrapped in two lists? If not this may + #>break the function. + .parseQueryToDT <- function(queryRes) as.data.table(queryRes[[1]][[1]]) + queryRes <- lapply(queryRes, FUN=.parseQueryToDT) + queryRes <- rbindlist(queryRes, fill=TRUE) + switch(to, + 'cids'={ + unlistQueryRes <- queryRes[, NSC_id := unlist(queries)][, + lapply(.SD, FUN=.replace_NULL_NA)][, lapply(.SD, unlist)] + if (nrow(unlistQueryRes) > nrow(queryRes)) + .warning(funContext, 'Some IDs multimap to returned CIDs, + check for sduplicates to see which ones!') + if (any(is.na(unlistQueryRes$CID))) .warning(funContext, 'Some IDs + failed to map and will have NA CIDs.') + }, + 'sids'={ + unlistQueryRes <- queryRes[, NSC_id := unlist(queries)][, + lapply(.SD, FUN=.replace_NULL_NA)][, lapply(.SD, unlist)] + if (nrow(unlistQueryRes) > nrow(queryRes)) + .warning(funContext, 'Some IDs multimap to returned SIDs, + check for duplicates to see which ones!') + if (any(is.na(unlistQueryRes$SID))) .warning(funContext, 'Some IDs + failed to map and will have NA SIDs.') + }, + .error('The operation ', to, ' has not been implemented yet!', + ' To return the unprocessed results of the query, set `raw=TRUE`.') + ) + # rearrange columns so that NSC_id is first + setcolorder(unlistQueryRes, rev(colnames(unlistQueryRes))) + if (length(failedQueries) > 0) { + .warning(funContext, 'One or more queries failed, please see + `attributes()$failed` for more information.') + attributes(unlistQueryRes)$failed <- failedQueries + } + return(unlistQueryRes) +} diff --git a/R/parseJSON.R b/R/parseJSON.R index 59eec08..be8f95c 100644 --- a/R/parseJSON.R +++ b/R/parseJSON.R @@ -1,3 +1,5 @@ + + #' Parse a JSON into a list #' #' @param response A `response` object as returned by `httr::GET` @@ -10,14 +12,20 @@ #' @md #' @importFrom jsonlite fromJSON #' @importFrom httr content -#' @export +#' @export parseJSON <- function(response, ..., encoding='UTF-8', query_only=FALSE) { if (isTRUE(query_only)) return(response) + response <- content(response, encoding = "UTF-8", as='text', type='JSON') + + # if (is.null(response)) return(NA) + # if (is.na(response)) return(NA) + + if (is.null(response) | is.na(response)) return(NULL) + tryCatch({ - fromJSON(content(response, ..., as='text', type='JSON', - encoding=encoding)) + fromJSON(response, ...) }, error=function(e) { - fromJSON(content(response, ..., type='JSON', encoding=encoding)) + NA }) -} \ No newline at end of file +} diff --git a/build/annotationGx.Dockerfile b/build/annotationGx.Dockerfile index 8d2e1a6..b4f3730 100644 --- a/build/annotationGx.Dockerfile +++ b/build/annotationGx.Dockerfile @@ -3,8 +3,8 @@ FROM bioconductor/bioconductor_docker:3.17-R-4.3.0 # Install required libraries -- using prebuild binaries where available -# RUN apt-get update && apt-get install -y \ -# git \ +RUN apt-get update && apt-get install -y \ + git # r-cran-data.table \ # r-cran-doparallel \ # r-cran-dygraphs \ @@ -30,6 +30,12 @@ FROM bioconductor/bioconductor_docker:3.17-R-4.3.0 # Install R packages # RUN install2.r --error --deps TRUE BiocManager +# install data.table +# RUN R -e "BiocManager::install('data.table', update = FALSE, ask = FALSE)" + +RUN install2.r --error --deps TRUE pak + +RUN R -e "pak::pkg_install('bhklab/AnnotationGx@main', ask = F)" RUN rm -rf /tmp/* diff --git a/man/parseJSON.Rd b/man/parseJSON.Rd index 82a38c4..b967d03 100644 --- a/man/parseJSON.Rd +++ b/man/parseJSON.Rd @@ -1,11 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getPubChem.R, R/parseJSON.R +% Please edit documentation in R/parseJSON.R \name{parseJSON} \alias{parseJSON} \title{Parse a JSON into a list} \usage{ -parseJSON(response, ..., encoding = "UTF-8", query_only = FALSE) - parseJSON(response, ..., encoding = "UTF-8", query_only = FALSE) } \arguments{ @@ -16,12 +14,8 @@ parseJSON(response, ..., encoding = "UTF-8", query_only = FALSE) \item{as}{A \code{character} vector indicating the return type. Options are 'raw',} } \description{ -Parse a JSON into a list - Parse a JSON into a list } \seealso{ -\link[httr:content]{httr::content} - \link[httr:content]{httr::content} }