diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 700bbed..0b4aa59 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,7 +2,7 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, jermiah] pull_request: branches: [main, master] @@ -31,18 +31,18 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true - - uses: r-lib/actions/setup-r-dependencies@v1 + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: rcmdcheck - - uses: r-lib/actions/check-r-package@v1 + - uses: r-lib/actions/check-r-package@v2 with: error-on: '"error"' \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index bce9f5c..4a578fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,6 +5,8 @@ Version: 0.0.5.9001 Authors@R: c( person("Christopher", "Eeles", role = c("aut"), email = "christopher.eeles@uhnresearch.ca"), + person("Jermiah", "Joseph", role = c("aut"), + email = "jermiah.joseph@uhn.ca"), person("Sisira", "Nair", role = c("aut"), email="sisira.nair@uhnresearch.ca"), person("Petr", "Smirnov", role=c("aut")), @@ -30,7 +32,6 @@ Imports: qs, checkmate, rlang, - CoreGx, crayon, memoise, R6, diff --git a/NAMESPACE b/NAMESPACE index 2f6d128..14e2b30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(.buildURL) +export(.createQueryURLs) export(.groupListByName) export(addAnnotationColumnToDrugs) export(characterToNamedVector) @@ -13,9 +14,10 @@ export(downloadAndExtract) export(downloadFDAOrangeBook) export(find_remote_files_recursive) export(fsquash) 
+export(getAllPubChemAnnotations) export(getBrainArrayTable) -export(getCellApi) export(getCellosaurus) +export(getCellosaurusAPI) export(getCellosaurusDataFrame) export(getCelloxml) export(getChemblAllMechanisms) @@ -31,7 +33,7 @@ export(getGencodeFilesTable) export(getGencodeGRangesAnnotated) export(getGuideToPharm) export(getInfoFromCelllineInput) -export(getPubChemAnnotations) +export(getPubChemAnnotation) export(getPubChemCompound) export(getPubChemFromNSC) export(getPubChemSubstance) @@ -55,7 +57,6 @@ export(scrapeRemoteFTPFilesTable) export(zenodoMetadata) import(R6) import(checkmate) -import(httr) importFrom(BiocParallel,"bplog<-") importFrom(BiocParallel,"bpprogressbar<-") importFrom(BiocParallel,"bpworkers<-") @@ -66,8 +67,6 @@ importFrom(BiocParallel,bpparam) importFrom(BiocParallel,bpprogressbar) importFrom(BiocParallel,bptry) importFrom(BiocParallel,bpworkers) -importFrom(CoreGx,.errorMsg) -importFrom(CoreGx,.warnMsg) importFrom(R6P,Singleton) importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,mcols) diff --git a/R/getCellosaurus.R b/R/getCellosaurus.R index eab15e1..981149b 100644 --- a/R/getCellosaurus.R +++ b/R/getCellosaurus.R @@ -19,54 +19,58 @@ #' @md #' @importFrom xml2 read_xml #' @export -getCelloxml <- - memoise::memoise(function(url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml", verbose = TRUE) { - if (verbose) { - message(paste( - "xml read started from", - url, - format(Sys.time(), "%Y-%m-%d %H:%M:%S") - )) - } - main_xml <- read_xml(url) - if (verbose) { - message(paste( - "xml read completed at", - format(Sys.time(), "%Y-%m-%d %H:%M:%S") - )) - } - return(main_xml) - }) - -#' @md -#' @importFrom xml2 read_xml xml_find_all -#' @export -#########ADD DOCS -cleanCellnames <- - function(main_xml, verbose = TRUE) { +getCelloxml <- memoise::memoise(function(url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml", verbose = TRUE) { if (verbose) { message(paste( - "Started removing special characters from cell 
line names in the xml", + "xml read started from", + url, format(Sys.time(), "%Y-%m-%d %H:%M:%S") )) } - matching <- xml_find_all(main_xml, "//cell-line/name-list/name/text()") - # A raw string, added in R 4.0 will excape characters for you: r"{ }" - badchars <- r"{[\xb5]|[]|[ ,]|[;]|[:]|[-]|[+]|[*]|[%]|[$]|[#]|[{]|[}]|[[]|[]]|[|]|[^]|[/]|[\]|[.]|[_]|[ ]|[(]|[)]}" - for(i in 1:length(matching)){ - node1 <- matching[[i]] - node1text <- xml_text(node1) - xml_par <- xml_find_first(node1, "parent::*") - xml_set_attr(xml_par, "cleanname", gsub(badchars,"",ignore.case = TRUE, node1text)) - } + main_xml <- read_xml(url) if (verbose) { message(paste( - "Removed special characters from cell line names in the xml", + "xml read completed at", format(Sys.time(), "%Y-%m-%d %H:%M:%S") )) } return(main_xml) } +) + +#' Clean Cell Names +#' +#' @description +#' TODO:: +#' +#' @md +#' @importFrom xml2 read_xml xml_find_all +#' @export +#########ADD DOCS +cleanCellnames <- function(main_xml, verbose = TRUE) { + if (verbose) { + message(paste( + "Started removing special characters from cell line names in the xml", + format(Sys.time(), "%Y-%m-%d %H:%M:%S") + )) + } + matching <- xml_find_all(main_xml, "//cell-line/name-list/name/text()") + # A raw string, added in R 4.0 will excape characters for you: r"{ }" + badchars <- r"{[\xb5]|[]|[ ,]|[;]|[:]|[-]|[+]|[*]|[%]|[$]|[#]|[{]|[}]|[[]|[]]|[|]|[^]|[/]|[\]|[.]|[_]|[ ]|[(]|[)]}" + for(i in 1:length(matching)){ + node1 <- matching[[i]] + node1text <- xml_text(node1) + xml_par <- xml_find_first(node1, "parent::*") + xml_set_attr(xml_par, "cleanname", gsub(badchars,"",ignore.case = TRUE, node1text)) + } + if (verbose) { + message(paste( + "Removed special characters from cell line names in the xml", + format(Sys.time(), "%Y-%m-%d %H:%M:%S") + )) + } + return(main_xml) +} #' Filter parent node cell-line and parse child nodes for required annotations #' @param cell_ip is either cell name or cvcl id. 
diff --git a/R/getCellosaurusAPI.R b/R/getCellosaurusAPI.R new file mode 100644 index 0000000..cdb6a2b --- /dev/null +++ b/R/getCellosaurusAPI.R @@ -0,0 +1,164 @@ + +#' Create a list of query URLS for Cellosaurus API +#' +#' @description +#' This function creates a queryURL for the cellosaurus API using a list of cell line names +#' +#' @details +#' Function to create a URL query for Cellosaurus to search for a cell-line using its name +#' An example call: computedURLs <- .createQueryURLs(api = "https://api.cellosaurus.org/", cl_names = c("22rv1", "Hela"), fields = c("id", "ac")) +#' @return A list of URLS +#' @param api is the link to the API to build the URL. i.e "https://api.cellosaurus.org/" +#' @param cl_names is a list of the cell line names +#' @param format is the type of format to return from the API. Can be "txt" or "json" +#' @param num_results is the number of of items to return, DEFAULT=1 +#' @param GETfxn is the function to use on the cellosaurus website. Currently only supports "search/cell-line?" +#' @param fields is a list of desired fields to include in the response +#' +#' @md +#' @export +.createQueryURLs <- + function(api = "https://api.cellosaurus.org/", + cl_names, + format = "txt", + num_results = 1, + GETfxn = c("search/cell-line?", "cell-line/"), + fields, + q = "idsy:") { + + if (GETfxn == "search/cell-line?") { + # create urls + computedURLs <- paste0( + api, + GETfxn, + "q=", q, + gsub(" ", "%20",cl_names), + "&rows=", num_results, + "&format=", format, + "&fields=", paste(fields, collapse=",") + ) + return(computedURLs) + } else if (GETfxn == "cell-line/") { + computedURLs <- paste0( + api, + GETfxn, + gsub(" ", "%20",cl_names), + "?", + "format=",format, + "&fields=", paste(fields, collapse=",") + ) + return(computedURLs) + } else { + stop("GETfxn must be either 'search/cell-line?' 
or 'cell-line/'") + } + + return(computedURLs) +} + +#' Query Cellosaurus +#' +#' @description +#' This function takes a list of cell line names and interested fields and gets responses from the Cellosaurus API +#' +#' @details +#' Function to get responses from Cellosaurus API +#' +#' @return A list of responses +#' @param cl_names is a list of the cell line names +#' @param fields is a list of desired fields to obtain for each cell line in the API query, i.e if only trying to get synonynms and primary accesssion then fields=c("sy", "ac"). see https://api.cellosaurus.org/static/fields_help.html for all fields. +#' +#' @md +#' @export +#' +getCellosaurusAPI <- + function( + cl_names, # List of cell line names + fields = c( + "id", # Recommended name. Most frequently the name of the cell line as provided in the original publication. + "ac", # Primary accession. It is the unique identifier of the cell line. It is normally stable across Cellosaurus versions ... + "sy", # List of synonyms. + "misspelling", # Identified misspelling(s) of the cell line name + "din", # Disease(s) suffered by the individual from which the cell line originated with its NCI Thesaurus or ORDO identifier. + "ca", # Category to which a cell line belongs, one of 14 defined terms. Example: cancer cell line, hybridoma, transformed cell line. + "sx", # Sex + "ag", # Age at sampling time of the individual from which the cell line was established. + "sampling-site", # Body part, organ, cell-type the cell line is derived from + "metastatic-site" # Body part, organ the cell line is derived from in the case of a cell line originating from a cancer metastasis. 
+ ), + GETfxn = c("search/cell-line?", "cell-line/"), # Function to use on the cellosaurus website + querydomain = "ac:" + ){ + cellosaurus_api_url <- "https://api.cellosaurus.org/" + + computedURLs <- .createQueryURLs(api = cellosaurus_api_url, GETfxn = GETfxn, cl_names = cl_names, fields = fields, q = querydomain) + + responseList <- BiocParallel::bplapply(computedURLs, function(x) GET(x)) + names(responseList) <- cl_names + return(responseList) +} + +#' Clean cellosaurus responses +#' +#' @description +#' This function takes a list of Cellosaurus Responses and cleans them for use +#' +#' @details This function takes a list of Cellosaurus Responses and cleans them for use +#' @return A list of responses +#' @param responseList is a list of responses +#' +#' @md +#' @export +#' +cleanCellosaurusResponse <- + function( + responseList, + GETfxn = c("search/cell-line?", "cell-line/") + ){ + # Get content of each response, then separate content on newline character + responseContent <- lapply(lapply(responseList, httr::content), + function(x) strsplit(x=x, split="\n")) + + if (GETfxn == "search/cell-line?") { + #Remove first 15 rows of content + responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)]) + # Split on first " " appearance + responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x, split = " .")) + # rbind responses (do.call returns as one large matrix instead of dataframe) + df_ <- lapply(responseContent_sub_split, function(x) do.call(rbind, x)) + + # convert each response from matrix to data.table + df_2 <- lapply(df_, function(x) data.table(x)) + + # rbinds all responses (creates new column so all of cell line x will have its name in col cellLine) + df_3<- rbindlist(df_2, idcol = "cellLine") + df_3 <- df_3[V1!="//"] + + # Collapse all rows with the same cellLine & V1 (most often rows for cc) and separate by "; " + df_4 <- df_3[, list(data = paste0(unique(na.omit(V2)), collapse ="; ")), by = c("cellLine", "V1")] 
+ + # transpose + df_5 <- data.table::dcast(df_4, cellLine ~ ...) + + return(df_5) + } else if (GETfxn == "cell-line/") { + #Remove first 15 rows of content + # responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)]) + responseContent_sub <- responseContent + responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x[[1]], split = " .")) + + result <- rbindlist(lapply(responseContent_sub_split, function(x) { + # remove the entire column if any of the elements has "//" in it + if (any(grepl("//", x))) { + x <- x[, -1] + } + + cvcl_dt <- as.data.table(x) + names(cvcl_dt) <- as.character(cvcl_dt[1]) + cvcl_dt[2] + })) + return(result) + + } else { + stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'") + } +} \ No newline at end of file diff --git a/R/getDrugBank.R b/R/getDrugBank.R index b42a273..d29b900 100644 --- a/R/getDrugBank.R +++ b/R/getDrugBank.R @@ -188,12 +188,12 @@ listColToDT <- function(col) { } # Function testing scripts, not run unless this file is called as a script -if (sys.nframe() == 0) { - library(AnnotationGx) - library(xml2) - library(data.table) - library(BiocParallel) +# if (sys.nframe() == 0) { +# library(AnnotationGx) +# library(xml2) +# library(data.table) +# library(BiocParallel) - drugTargetDT <- getDrugTargets('local_data/drugbank.xml') +# drugTargetDT <- getDrugTargets('local_data/drugbank.xml') -} \ No newline at end of file +# } \ No newline at end of file diff --git a/R/getPubChem-helpers.R b/R/getPubChem-helpers.R new file mode 100644 index 0000000..ac514c1 --- /dev/null +++ b/R/getPubChem-helpers.R @@ -0,0 +1,170 @@ +#' Checks to see if the PubChem query is exceeding the throttling limit +#' @param response `httr::response` +#' @param throttleMessage `logical` whether to print the throttling message +#' @return `logical` whether the query is throttled +.checkThrottlingStatus <- function(result, throttleMessage = FALSE){ + message <- headers(result)$`x-throttling-control` + + if 
(throttleMessage == TRUE){ + message(message) + } + matches <- regmatches(message, gregexpr("\\((.*?)%\\)", message)) # Extracts text within parentheses + percentages <- gsub("\\(|%|\\)", "", unlist(matches[1:3])) + # print(percentages) + percentage <- max(as.numeric(percentages)) + if(as.integer(percentage) > 15 && as.integer(percentage) < 30){ + Sys.sleep(15) + }else if (as.integer(percentage) > 30 && as.integer(percentage) < 50){ + Sys.sleep(20) + }else if (as.integer(percentage) > 50 && as.integer(percentage) < 75) { + print(paste0("Throttling at ", percentage, "%. Sleeping for 30 seconds.")) + Sys.sleep(30) + }else if (as.integer(percentage) > 75) { + print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds.")) + Sys.sleep(60) + }else{ + Sys.sleep(5) + } + + return(as.integer(percentage) > 15) +} + +## TODO:: Retrieve PubChem server status to dynamically set query spacing +##>based on server load +## TODO:: Make the query away for server load status in response header +#' @importFrom crayon strip_style +.queryPubChemSleep <- function(x, ..., query_only=FALSE) { + proxy <- list(...)$proxy + t1 <- Sys.time() + queryRes <- tryCatch({ + queryRequestPubChem(x, ..., query_only=query_only) + }, + error=function(e) { + cat('\r') + print(e) + list(Error=list( + Code='.queryPubChemSleep.ERROR', + Message='See Details for error msg', + Details=paste0(strip_style(e), collapse=' ') + )) + }) + t2 <- Sys.time() + queryTime <- t2 - t1 + if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime) + return(queryRes) +} + +# ----------------------------- +# getPubChemAnnotations Helpers + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseATCannotations <- function(DT) { + dataL <- DT$Data + names(dataL) <- DT$SourceID + dataDT <- rbindlist(dataL, fill=TRUE, use.names=TRUE, idcol='SourceID') + dataDT[, ATC_code := unlist(lapply(Value.StringWithMarkup, + function(x) last(x)[[1]]))] + annotationDT <- merge.data.table( 
+ dataDT[, .(SourceID, ATC_code)], + DT[, .(SourceName, SourceID, LinkedRecords)], + by='SourceID', + allow.cartesian=TRUE + ) + DT <- annotationDT[, .(CID=unlist(LinkedRecords)), + by=.(SourceName, SourceID, ATC_code)] + DT <- unique(DT) + return(DT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseDILIannotations <- function(DT) { + dataL <- DT$Data + names(dataL) <- DT$SourceID + dataL <- lapply(dataL, FUN=`[`, i=Name %like% 'DILI') + dataDT <- rbindlist(dataL, fill=TRUE, use.names=TRUE, idcol='SourceID') + dataDT[, DILI := unlist(Value.StringWithMarkup)] + annotationDT <- merge.data.table( + dataDT[, .(SourceID, DILI)], + DT[, .(SourceID, SourceName, Name, LinkedRecords.CID, + LinkedRecords.SID)], + by='SourceID', + allow.cartesian=TRUE) + DT <- + annotationDT[, + .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), + by=.(SourceName, SourceID, Name, DILI)] + return(DT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseNSCannotations <- function(DT) { + DT[, NSC := unlist(lapply(Data, `[[`, i=4))] + annotationDT <- DT[, + .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), + by=.(SourceName, SourceID, NSC)] + return(annotationDT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseCTDannotations <- function(DT) { + annotationDT <- DT[, .(CID=unlist(LinkedRecords)), + by=.(SourceName, SourceID, URL)] + annotationDT[, CTD := gsub('::.*$', '', SourceID)] + return(annotationDT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames +.parseCASannotations <- function(list) { + # Make sure CIDs all go in the same column + CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords', + skip_absent=TRUE) + DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE) + DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))] + CAS_DT <- DT[, .(CAS=unlist(CAS)), 
by=.(SourceName, SourceID, Name)] + ID_DT <- DT[, + .(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))), + by=.(SourceName, SourceID, Name, URL)] + annotationDT <- merge.data.table( + CAS_DT, ID_DT, + by=c('SourceName', 'SourceID', 'Name'), + all.x=TRUE, + allow.cartesian=TRUE) + annotationDT <- unique(annotationDT) + return(annotationDT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseSynonymsAndIdentifiers <- function(DT) { + dataList <- lapply(DT$Data, as.data.table) + names(dataList) <- DT$SourceID + dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE, + idcol='SourceID') + DT[, Data := NULL] + dataDT[, + Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'), + by=SourceID] + dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID] + dataDT[, Value.StringWithMarkup := NULL] + annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE) + setnames(annotationDT, + old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'), + new=c('Type', 'Heading', 'ID') + ) + annotationDT <- unique(annotationDT) + return(annotationDT) +} + +#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist +.parseNamesAndSynonyms <- function(DT) { + DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])] + # Remove the weird annotation from the end of the synonym + DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')] + DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))] + # fix NULL list itemss + DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)] + DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)] + annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)), + by=.(SourceName, SourceID, Name, URL, Synonyms)] + annotationDT <- unique(annotationDT) + return(annotationDT) +} diff --git a/R/getPubChem.R 
b/R/getPubChem.R index 218fbb2..1e71500 100644 --- a/R/getPubChem.R +++ b/R/getPubChem.R @@ -144,7 +144,7 @@ #' @export getRequestPubChem <- function(id, domain='compound', namespace='cid', operation=NA, output='JSON', ..., url='https://pubchem.ncbi.nlm.nih.gov/rest/pug', - operation_options=NA, proxy=FALSE, raw=FALSE, query_only=FALSE) { + operation_options=NA, proxy=FALSE, raw=FALSE, query_only=FALSE, verbose = FALSE) { funContext <- .funContext('::getRequestPubChem') # handle list or vector inputs for id @@ -164,34 +164,13 @@ getRequestPubChem <- function(id, domain='compound', namespace='cid', operation= # get HTTP response, respecting the 30s max query time of PubChem API tryCatch({ - if (isTRUE(proxy)) { - if (is.null(proxyManager$get_proxies())) { - tryCatch(proxyManager$connect(), - error=function(e) - stop("proxyManager connection failed, set proxy=FALSE!\n\t", e) - ) - } - result <- FALSE - count <- 1 - while(isFALSE(result)) { - proxy <- unlist(proxyDT[sample(.N, 1), ]) - result <- tryCatch({ - RETRY('GET', encodedQuery, timeout(29), times=3, quiet=TRUE, - terminate_on=c(400, 404, 503), - use_proxy(proxy[1], port=as.integer(proxy[2]))) - }, error=function(e) FALSE) - count <- count + 1 - if (count > 10) .error(funContext, 'Infinite retry loop - due to failed proxy requests!') - } - } else { - result <- RETRY('GET', encodedQuery, timeout(29), times=3, - quiet=TRUE, terminate_on=c(400, 404, 503)) - } + + result <- RETRY('GET', encodedQuery, timeout(29), times=3, + quiet=TRUE, terminate_on=c(400, 404, 503)) if (isTRUE(raw)) return(result) - .checkThrottlingStatus(result) + .checkThrottlingStatus(result, throttleMessage = verbose) canParse <- tryCatch({ parseJSON(result, as='text'); TRUE }, error=function(e) FALSE) @@ -217,29 +196,6 @@ getRequestPubChem <- function(id, domain='compound', namespace='cid', operation= }) } -#' Checks to see if the PubChem query is exceeding the throttling limit -#' @param response `httr::response` -.checkThrottlingStatus <- 
function(response) { - throttling_control <- headers(response)$`x-throttling-control` - any_grepl <- function(...) any(grepl(...)) - throttling_state <- max(which(vapply( - c('Green', 'Yellow', 'Red', 'Black', 'blacklisted'), - FUN=any_grepl, x=throttling_control, FUN.VALUE=logical(1)))) - if (throttling_state == 2) { - .warning('PubChem Server returned Yellow status! Sleeping to compensate.') - Sys.sleep(5) - } else if (throttling_state == 3) { - .warning('PubChem Server returend Red status! Sleeping to compensate.') - Sys.sleep(10) - } else if (throttling_state == 4) { - .error('PubChem Server returned Black status! You could be ', - 'black listed. The returned state message is: ', - throttling_control, '.') - } else if (throttling_state == 5) { - .error('PubChem server indicated: too many queries per second', - ' or you may be blacklisted.') - } -} #' @title queryPubChem #' @@ -259,7 +215,7 @@ getRequestPubChem <- function(id, domain='compound', namespace='cid', operation= queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, output='JSON', ..., url='https://pubchem.ncbi.nlm.nih.gov/rest/pug', operation_options=NA, batch=TRUE, raw=FALSE, proxy=FALSE, - query_only=FALSE) { + query_only=FALSE, verbose = FALSE) { if (!is.character(id)) id <- as.character(id) if (namespace %in% c('name', 'xref', 'smiles', 'inchi', 'sdf')) batch <- FALSE @@ -290,12 +246,12 @@ queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, queryRes <- bplapply(queries, FUN=.queryPubChemSleep, domain=domain, namespace=namespace, operation=operation, output=output, url=url, operation_options=operation_options, BPPARAM=BPPARAM, proxy=proxy, - raw=raw, query_only=query_only) + raw=raw, query_only=query_only, verbose = verbose) } else { queryRes <- bplapply(id, FUN=.queryPubChemSleep, domain=domain, namespace=namespace, operation=operation, output=output, url=url, operation_options=operation_options, BPPARAM=BPPARAM, proxy=proxy, - raw=raw, 
query_only=query_only) + raw=raw, query_only=query_only, verbose = verbose) queries <- as.list(id) } @@ -318,30 +274,6 @@ queryPubChem <- function(id, domain='compound', namespace='cid', operation=NA, return(queryRes) } -## TODO:: Retrieve PubChem server status to dynamically set query spacing -##>based on server load -## TODO:: Make the query away for server load status in response header -#' @importFrom crayon strip_style -.queryPubChemSleep <- function(x, ..., query_only=FALSE) { - proxy <- list(...)$proxy - t1 <- Sys.time() - queryRes <- tryCatch({ - queryRequestPubChem(x, ..., query_only=query_only) - }, - error=function(e) { - cat('\r') - print(e) - list(Error=list( - Code='.queryPubChemSleep.ERROR', - Message='See Details for error msg', - Details=paste0(strip_style(e), collapse=' ') - )) - }) - t2 <- Sys.time() - queryTime <- t2 - t1 - if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime) - return(queryRes) -} #' Parse a JSON into a list #' @@ -539,8 +471,8 @@ getPubChemCompound <- function(ids, from='cid', to='property', ..., batch <- FALSE } - if (to == 'property') - to <- paste0(to, '/', paste0(properties, collapse=',')) + if (to == 'property') to <- paste0(to, '/', paste0(properties, collapse=',')) + queryRes <- queryPubChem(ids, domain='compound', namespace=from, operation=to, batch=batch, raw=raw, proxy=proxy, operation_options=options, query_only=query_only, ...) 
@@ -644,6 +576,8 @@ getPubChemSubstance <- function(ids, from='cid', to='sids', ..., return(queryRes) } + + #' Get a selected annotation for all PubChem entries #' #' @description @@ -684,15 +618,32 @@ getPubChemSubstance <- function(ids, from='cid', to='sids', ..., #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist fwrite #' @importFrom BiocParallel bpparam bpworkers bpprogressbar bptry #' @export -getPubChemAnnotations <- function(header='Available', type='Compound', - parseFUN=identity, ..., output='JSON', raw=FALSE, +getAllPubChemAnnotations <- + function( + header='Available', + type='Compound', + parseFUN=identity, + output='JSON', + raw=FALSE, + rawAnnotationDT=FALSE, + verbose = FALSE, url='https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading', - BPPARAM=bpparam(), proxy=FALSE) { + BPPARAM=bpparam(), + proxy=FALSE, + retries=3, + maxPages=NA, + ... + ) { funContext <- .funContext('::getPubChemAnnotations') if (header == 'Available') { queryURL <- 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/annotations/headings/JSON' - } else { + } else if (header == 'data') { + url <- 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/' + queryURL <- paste0(.buildURL(url, header, output), + '?heading_type=', type) + } + else { queryURL <- paste0(.buildURL(url, header, output), '?heading_type=', type) } @@ -704,7 +655,7 @@ getPubChemAnnotations <- function(header='Available', type='Compound', count <- 1 while(isFALSE(queryRes)) { proxy <- unlist(proxyDT[sample(.N, 1), ]) - queryRes <- tryCatch({ RETRY('GET', encodedQueryURL, timeout(29), times=3, + queryRes <- tryCatch({ RETRY('GET', encodedQueryURL, timeout(29), times=retries, quiet=TRUE, use_proxy(proxy[1], as.integer(proxy[2]))) }, error=function(e) FALSE) @@ -717,50 +668,42 @@ getPubChemAnnotations <- function(header='Available', type='Compound', } fwrite(proxyDT, file=file.path(tempdir(), 'proxy.csv')) } else { - queryRes <- RETRY('GET', encodedQueryURL, 
timeout(29), times=3, + queryRes <- RETRY('GET', encodedQueryURL, timeout(29), times=retries, quiet=TRUE) } - + .checkThrottlingStatus(queryRes) if (isTRUE(raw)) return(queryRes) resultDT <- as.data.table(parseJSON(queryRes)[[1]][[1]]) if (header == 'Available') return(resultDT) - numPages <- as.numeric(content(queryRes)[[1]]$TotalPages) + numPages <- content(queryRes)[[1]]$TotalPages + if (is.null(numPages)) { + numPages <- 1 + }else { + if (!is.na(maxPages)) numPages <- min(as.numeric(maxPages),as.numeric(numPages)) + if (verbose) print(paste0("numPages: ", numPages)) + } + + if (numPages > 1) { tryCatch({ - bpworkers(BPPARAM) <- 5 + # bpworkers(BPPARAM) <- 5 bpprogressbar(BPPARAM) <- TRUE + print(BPPARAM) + BiocParallel::register(BPPARAM) }, error=function(e) .warning(funContext, 'Failed to set parallelzation parameters! Please configure them yourself and pass in as the BPPARAM argument.')) pageList <- bplapply(seq(2, numPages), function(i, queryURL, numPages) { t1 <- Sys.time() encodedURL <- URLencode(paste0(queryURL, '&page=', i)) - if (isTRUE(proxy)) { - proxyDT <- fread(file.path(tempdir(), 'proxy.csv')) - queryRes <- FALSE - count <- 1 - while(isFALSE(queryRes)) { - proxy <- unlist(proxyDT[sample(.N, 1), ]) - print(proxy) - queryRes <- tryCatch({ RETRY('GET', encodedQueryURL, timeout(29), times=3, - quiet=TRUE, use_proxy(proxy[1], as.integer(proxy[2]))) - }, error=function(e) FALSE) - - if (isFALSE(queryRes)) { - proxyDT <- proxyDT[ip != proxy[1] & port != proxy[2], ] - } - count <- count + 1 - if (count > 10) .error(funContext, 'Infinite retry loop - due to failed proxy requests!') - } - fwrite(proxyDT, file=file.path(tempdir(), 'proxy.csv')) - } else { - queryRes <- RETRY('GET', encodedQueryURL, timeout(29), times=3, + queryRes <- RETRY('GET', encodedQueryURL, timeout(1), times=10, quiet=TRUE) - } + + .checkThrottlingStatus(queryRes) + page <- tryCatch({ as.data.table(parseJSON(queryRes)[[1]][[1]]) }, error=function(e) { @@ -770,9 +713,9 @@ 
getPubChemAnnotations <- function(header='Available', type='Compound', }) t2 <- Sys.time() queryTime <- t2 - t1 - if (queryTime < 0.31) Sys.sleep(0.31 - queryTime) + # if (queryTime < 0.31) Sys.sleep(0.31 - queryTime) return(page) - }, BPPARAM=BPPARAM, queryURL=queryURL, numPages=numPages) + }, queryURL=queryURL, numPages=numPages) pageList <- c(list(resultDT), pageList) } else { pageList <- list(resultDT) @@ -780,9 +723,15 @@ getPubChemAnnotations <- function(header='Available', type='Compound', if (header != 'CAS') { annotationDT <- rbindlist(pageList, fill=TRUE, use.names=TRUE) + if (verbose) print("applying as.data.table to Data column of annotationDT") annotationDT[, Data := lapply(Data, as.data.table)] + } else { + annotationDT <- pageList + } + if (isTRUE(rawAnnotationDT)) { + if(verbose) print(paste0("Not Parsing, ", header, " returning annotationDT")) + return(annotationDT) } - # parse the results to a user friendly format switch(header, 'ATC Code'=return(.parseATCannotations(annotationDT)), @@ -791,7 +740,7 @@ getPubChemAnnotations <- function(header='Available', type='Compound', 'CTD Chemical-Gene Interactions'=return(.parseCTDannotations(annotationDT)), 'Names and Synonyms'=return(.parseNamesAndSynonyms(annotationDT)), 'Synonyms and Identifiers'=return(.parseSynonymsAndIdentifiers(annotationDT)), - 'CAS'=return(.parseCASannotations(pageList)), + 'CAS'=return(.parseCASannotations(annotationDT)), tryCatch({ parseFUN(annotationDT) }, @@ -803,107 +752,152 @@ getPubChemAnnotations <- function(header='Available', type='Compound', }) ) } +#' @title getPubChemAnnotation +#' @description queries the PubChem PUG-VIEW API to get a single annotation using a CID for a header +#' +#' @param compound `character(1)` A valid CID to use for the query. +#' @param header `character(1)` A valid header name for the PUG VIEW annotations +#' @param url `character(1)` The URL to perform API queries on. 
default = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound' +#' @param output `character(1)` The output format. Defaults to 'JSON'. +#' @param timeout_s `numeric(1)` The number of seconds to wait before timing out. Default is 29. +#' @param retries `numeric(1)` The number of times to retry a failed query. Default is 3. +#' @param quiet `logical(1)` Should the function be quiet? Default is TRUE. +#' @param throttleMessage `logical(1)` Should a message be printed when the query is throttled? Default is FALSE. +#' @export +getPubChemAnnotation <- function( + compound, + header = 'ChEMBL ID', + url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound', + output = 'JSON', + timeout_s = 29, + retries = 3, + quiet = TRUE, + throttleMessage = FALSE + ){ + + # TODO:: add a check to see if the compound is a valid CID or SID + # TODO:: allow for variaitons of headers due to spelling errors + # Temporary: + if(header == "DILI") queryURL <- paste0(.buildURL(url, compound, output), '?heading=', "Drug Induced Liver Injury") + else queryURL <- paste0(.buildURL(url, compound, output), '?heading=', header) -# ----------------------------- -# getPubChemAnnotations Helpers - -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseATCannotations <- function(DT) { - dataL <- DT$Data - names(dataL) <- DT$SourceID - dataDT <- rbindlist(dataL, fill=TRUE, use.names=TRUE, idcol='SourceID') - dataDT[, ATC_code := unlist(lapply(Value.StringWithMarkup, - function(x) last(x)[[1]]))] - annotationDT <- merge.data.table( - dataDT[, .(SourceID, ATC_code)], - DT[, .(SourceName, SourceID, LinkedRecords)], - by='SourceID' - ) - DT <- annotationDT[, .(CID=unlist(LinkedRecords)), - by=.(SourceName, SourceID, ATC_code)] - return(DT) -} + tryCatch({ + result <- RETRY('GET', URLencode(queryURL), times = retries, quiet = quiet) + }, error=function(e) { + print(paste0("Error: ", e$message)) + return(NULL) + }) -#' @importFrom data.table data.table 
as.data.table merge.data.table last rbindlist -.parseDILIannotations <- function(DT) { - dataL <- DT$Data - names(dataL) <- DT$SourceID - dataL <- lapply(dataL, FUN=`[`, i=Name %like% 'DILI') - dataDT <- rbindlist(dataL, fill=TRUE, use.names=TRUE, idcol='SourceID') - dataDT[, DILI := unlist(Value.StringWithMarkup)] - annotationDT <- merge.data.table( - dataDT[, .(SourceID, DILI)], - DT[, .(SourceID, SourceName, Name, LinkedRecords.CID, - LinkedRecords.SID)], - by='SourceID') - DT <- annotationDT[, .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), - by=.(SourceName, SourceID, Name, DILI)] - return(DT) -} + .checkThrottlingStatus(result, throttleMessage = throttleMessage) + result <- parseJSON(result) + # switch(header, + # 'ATC Code'=return(.parseATCannotations(annotationDT)), + # 'Drug Induced Liver Injury'=return(.parseDILIannotations(annotationDT)), + # 'NSC Number'=return(.parseNSCannotations(annotationDT)), + # 'CTD Chemical-Gene Interactions'=return(.parseCTDannotations(annotationDT)), + # 'Names and Synonyms'=return(.parseNamesAndSynonyms(annotationDT)), + # 'Synonyms and Identifiers'=return(.parseSynonymsAndIdentifiers(annotationDT)), + # 'CAS'=return(.parseCASannotations(annotationDT)), + # tryCatch({ + # parseFUN(annotationDT) + # }, + # error=function(e) { + # .warning(funContext, 'The parseFUN function failed: ', e, + # '. Returning unparsed results instead. 
Please test the parseFUN + # on the returned data.') + # return(annotationDT) + # }) + # ) + + + if (header == 'ChEMBL ID') { + result <- .parseCHEMBLresponse(result) + }else if (header == 'NSC Number'){ + result <- .parseNSCresponse(result) + }else if (header == 'DILI' || header =='Drug Induced Liver Injury'){ + result <- .parseDILIresponse(result) + }else if (header == 'CAS'){ + result <- .parseCASresponse(result) + }else if (header == 'ATC Code'){ + result <- .parseATCresponse(result) + } + + # NOTE(review): switch disabled — it re-parsed the already-parsed result and dropped the 'Drug Induced Liver Injury' case; the if chain above is the single parse path + # result <- switch( + # header, + # 'ChEMBL ID' = .parseCHEMBLresponse(result), + # 'NSC Number' = .parseNSCresponse(result), + # 'DILI' = .parseDILIresponse(result), + # 'CAS' = .parseCASresponse(result), + # 'ATC Code' = .parseATCresponse(result) + # ) + + if (is.null(result)) result <- list(compound, "N/A") + else result <- list(compound,result) + names(result) <- c("cid", header) + return(result) + } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseNSCannotations <- function(DT) { - DT[, NSC := unlist(lapply(Data, `[[`, i=4))] - annotationDT <- DT[, - .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)), - by=.(SourceName, SourceID, NSC)] - return(annotationDT) +#' Function that parses the results of the PubChem PUG-VIEW API for the CHEMBL ID header +.parseCHEMBLresponse <- function(response){ + result <- response$Record$Reference$SourceID + result <- gsub("::Compound", "", result) + return(result) } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseCTDannotations <- function(DT) { - annotationDT <- DT[, .(CID=unlist(LinkedRecords)), - by=.(SourceName, SourceID, URL)] - annotationDT[, CTD := gsub('::.*$', '', SourceID)] - return(annotationDT) +#' Function that parses the results of the PubChem PUG-VIEW API for the NSC Number header +.parseNSCresponse <- function(response){ + result <- response$Record$Reference$SourceID[1] + result <- gsub(" ", "", result)
+ return(result) } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames -.parseCASannotations <- function(list) { - # Make sure CIDs all go in the same column - CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords', - skip_absent=TRUE) - DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE) - DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))] - CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)] - ID_DT <- DT[, .( - CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x)), - SID=unlist(lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x))) - , by=.(SourceName, SourceID, Name, URL)] - annotationDT <- merge.data.table(CAS_DT, ID_DT, - by=c('SourceName', 'SourceID', 'Name'), all.x=TRUE) - return(annotationDT) +#' Function that parses the results of the PubChem PUG-VIEW API for the DILI header +.parseDILIresponse <- function(response){ + if(length(response$Record$Section) == 0){ + result <- "NA" + }else{ + dt_ <- as.data.table(response$Record$Section) + dt_ <- as.data.table(dt_)$Section[[1]] + dt_ <- as.data.table(dt_)$Section + dt_ <- as.data.table(dt_) + dt_ <- as.data.table(dt_)$Information + # print(as.data.table(dt_)[1:3, .(Name,unlist(Value))]) + section <- as.data.table(dt_)[1:3, "DILI" := paste0(unlist(Name), ":", unlist(Value))] + + # if any of the first 3 rows are NA, remove it + section <- section[!is.na(section$DILI)] + + + section <- paste0(section[1:3, DILI], collapse= "; ") + + # create a list for each row as Name:Value string with no spaces and no new lines + reference <- paste0("LTKBID:", response$Record$Reference$SourceID) + result <- c(section, reference) + result <- paste0(result, collapse = "; ") + } } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseSynonymsAndIdentifiers <- function(DT) { - dataList <- lapply(DT$Data, as.data.table) - names(dataList) <- DT$SourceID - dataDT <-
rbindlist(dataList, fill=TRUE, use.names=TRUE, - idcol='SourceID') - DT[, Data := NULL] - dataDT[, - Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'), - by=SourceID] - dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID] - dataDT[, Value.StringWithMarkup := NULL] - annotationDT <- merge.data.table(dataDT, DT, by='SourceID') - setnames(annotationDT, - old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'), - new=c('Type', 'Heading', 'ID') - ) - return(annotationDT) +#' Function that parses the results of the PubChem PUG-VIEW API for the CAS header +.parseCASresponse <- function(response){ + result <- response$Record$Reference$SourceID[1] + return(result) } -#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist -.parseNamesAndSynonyms <- function(DT) { - DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])] - # Remove the weird annotation from the end of the synonym - DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')] - DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))] - # fix NULL list itemss - DT[, CID := lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x)] - annotationDT <- DT[, .(CID=unlist(CID)), - by=.(SourceName, SourceID, Name, URL, Synonyms)] - return(annotationDT) -} +#' Function that parses the results of the PubChem PUG-VIEW API for the ATC Code header +.parseATCresponse <- function(response){ + if(length(response$Record$Section) == 0){ + result <- "NA" + + }else{dt_ <- as.data.table(response$Record$Section) + dt_ <- as.data.table(dt_)$Section[[1]] + dt_ <- as.data.table(dt_)$Information + dt_ <- as.data.table(dt_)$Value + dt_ <- as.data.table(dt_[[1]]) + result <- paste0("ATC:", dt_$String) + if(length(result) > 1){ + result <- paste0(result, collapse = "; ") + }} + + return(result) +} \ No newline at end of file diff --git a/R/getUniProt.R b/R/getUniProt.R index 1dee2a6..ab05110 100644 --- a/R/getUniProt.R +++
b/R/getUniProt.R @@ -30,25 +30,24 @@ getUniProt <- function(url='') { #' @return A httr::response object containing the query results as `format`. #' #' @import httr -#' @importFrom CoreGx .errorMsg .warnMsg # #' @export -queryUniProt <- function(query, columns='', limit='', offset='', format='xml', - include=TRUE, compress=TRUE, ..., - url='https://www.uniprot.org/uniprot') { - stop("Not implemented yet!") - if (missing(query)) stop(.errorMsg(.context(), 'The query parameter is - missing! This parameter is mandatory.')) +# queryUniProt <- function(query, columns='', limit='', offset='', format='xml', +# include=TRUE, compress=TRUE, ..., +# url='https://www.uniprot.org/uniprot') { +# stop("Not implemented yet!") +# if (missing(query)) stop(.errorMsg(.context(), 'The query parameter is +# missing! This parameter is mandatory.')) - ##TODO:: handle more errors +# ##TODO:: handle more errors - # parse parameters - include <- if (include) 'yes' else 'no' - compress <- if (compress) 'yes' else 'no' +# # parse parameters +# include <- if (include) 'yes' else 'no' +# compress <- if (compress) 'yes' else 'no' - response <- GET() -} +# response <- GET() +# } -#' +# #' #' #' #' diff --git a/R/queryCellosaurus.R b/R/queryCellosaurus.R index 222cd27..4eba2e6 100644 --- a/R/queryCellosaurus.R +++ b/R/queryCellosaurus.R @@ -92,125 +92,3 @@ queryCellosaurus <- function(url = "https://ftp.expasy.org/databases/cellosaurus ## ============================= CELLOSAURUS WORK ============================== # these functions will be put into annotationGx for ease: - -#' Create a list of query URLS for Cellosaurus API -#' -#' @description -#' This function creates a queryURL for the cellosaurus API using a list of cell line names -#' -#' @details -#' Function to create a URL query for Cellosaurus to search for a cell-line using its name -#' An example call: finalURLs <- createQueryURLs(api = CLurl, cl_names = cl_names, fields = c("id", "ac")) -#' @return A list of URLS -#' @param api is 
the link to the API to build the URL. i.e "https://api.cellosaurus.org/" -#' @param cl_names is a list of the cell line names -#' @param format is the type of format to return from the API. Can be "txt" or "json" -#' @param clnum is the number of of cell lines to return, DEFAULT=1 -#' @param GETfxn is the function to use on the cellosaurus website. Currently only supports "search/cell-line?" -#' @param fields is a list of desired fields to include in the response -#' -#' @md -#' -createQueryURLs <- - function(api, - cl_names, - format = "txt", - clnum = 1, - GETfxn = "search/cell-line?", - fields) { - # create urls - finalURLs <- paste0(api, - GETfxn, - "q=", - "idsy:", - gsub(" ", "%20",cl_names), - "&", - "rows=", - clnum, - "&", - "format=",format, - "&", - "fields=", paste(fields, collapse=",") - ) - return(finalURLs) -} - -#' Query Cellosaurus -#' -#' @description -#' This function takes a list of cell line names and gets responses from the Cellosaurus API -#' -#' @details -#' Function to get responses from Cellosaurus API -#' -#' @return A list of responses -#' @param cl_names is a list of the cell line names -#' -#' @md -#' @export -#' -getCellApi <- - function(cl_names){ - CLurl <- "https://api.cellosaurus.org/" - - fields <- c("id", # Recommended name. Most frequently the name of the cell line as provided in the original publication. - "ac", # Primary accession. It is the unique identifier of the cell line. It is normally stable across Cellosaurus versions ... - "sy", # List of synonyms. - "misspelling", # Identified misspelling(s) of the cell line name - "din", # Disease(s) suffered by the individual from which the cell line originated with its NCI Thesaurus or ORDO identifier. - "ca", # Category to which a cell line belongs, one of 14 defined terms. Example: cancer cell line, hybridoma, transformed cell line. - "sx", # Sex - "ag", # Age at sampling time of the individual from which the cell line was established. 
- "sampling-site", # Body part, organ, cell-type the cell line is derived from - "metastatic-site" # Body part, organ the cell line is derived from in the case of a cell line originating from a cancer metastasis. - # "cc" # comments - ) - - finalURLs <- createQueryURLs(api = CLurl, cl_names = cl_names, fields = fields) - responseList <- bplapply(finalURLs, function(x) GET(x)) - names(responseList) <- cl_names - return(responseList) -} - -#' Clean cellosaurus responses -#' -#' @description -#' This function takes a list of Cellosaurus Responses and cleans them for use -#' -#' @details This function takes a list of Cellosaurus Responses and cleans them for use -#' @return A list of responses -#' @param responseList is a list of responses -#' -#' @md -#' @export -#' -cleanCellosaurusResponse <- - function(responseList){ - # Get content of each response, then separate content on newline character - responseContent <- lapply(lapply(responseList, httr::content), - function(x) strsplit(x=x, split="\n")) - - #Remove first 15 rows of content - responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)]) - - # Split on first " " appearance - responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x, split = " .")) - - # rbind responses (do.call returns as one large matrix instead of dataframe) - df_ <- lapply(responseContent_sub_split, function(x) do.call(rbind, x)) - - # convert each response from matrix to data.table - df_2 <- lapply(df_, function(x) data.table(x)) - - # rbinds all responses (creates new column so all of cell line x will have its name in col cellLine) - df_3<- rbindlist(df_2, idcol = "cellLine") - df_3 <- df_3[V1!="//"] - - # Collapse all rows with the same cellLine & V1 (most often rows for cc) and separate by "; " - df_4 <- df_3[, list(data = paste0(unique(na.omit(V2)), collapse ="; ")), by = c("cellLine", "V1")] - - # transpose - df_5 <- dcast(df_4, cellLine ~ ...) 
- - return(df_5) -} \ No newline at end of file diff --git a/build/README.md b/build/README.md new file mode 100644 index 0000000..1fbcec0 --- /dev/null +++ b/build/README.md @@ -0,0 +1,19 @@ +Set env variables for dockerfile: + +``` bash +export DOCKER_REPO="jjjermiah" +export DOCKER_IMAGE_NAME="annotationgx-r" +export DOCKER_TAG="0.1" +``` + +Commands to build the docker container: + +``` bash +docker build -t $DOCKER_REPO/$DOCKER_IMAGE_NAME:$DOCKER_TAG -f build/annotationGx.Dockerfile . +``` +Command to push to dockerhub: + +``` bash +docker push $DOCKER_REPO/$DOCKER_IMAGE_NAME:$DOCKER_TAG + +``` diff --git a/build/annotationGx.Dockerfile b/build/annotationGx.Dockerfile new file mode 100644 index 0000000..8d2e1a6 --- /dev/null +++ b/build/annotationGx.Dockerfile @@ -0,0 +1,37 @@ +# This file is for building the annotationGx image for linux + +FROM bioconductor/bioconductor_docker:3.17-R-4.3.0 + +# Install required libraries -- using prebuild binaries where available +# RUN apt-get update && apt-get install -y \ +# git \ +# r-cran-data.table \ +# r-cran-doparallel \ +# r-cran-dygraphs \ +# r-cran-foreach \ +# r-cran-fs \ +# r-cran-future.apply \ +# r-cran-gh \ +# r-cran-git2r \ +# r-cran-igraph \ +# r-cran-memoise \ +# r-cran-png \ +# r-cran-rcpparmadillo \ +# r-cran-rex \ +# r-cran-runit \ +# r-cran-stringdist \ +# r-cran-testthat \ +# r-cran-tidyverse \ +# r-cran-tinytest \ +# r-cran-xts \ +# sqlite3 \ +# sudo + +# Install R packages +# RUN install2.r --error --deps TRUE BiocManager + + + +RUN rm -rf /tmp/* + +CMD ["R"] \ No newline at end of file diff --git a/build/buildimage.sh b/build/buildimage.sh new file mode 100755 index 0000000..f02d1b0 --- /dev/null +++ b/build/buildimage.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export DOCKER_REPO="jjjermiah" +export DOCKER_IMAGE_NAME="annotationgx-r" +export DOCKER_TAG="0.1" + +docker build -t $DOCKER_REPO/$DOCKER_IMAGE_NAME:$DOCKER_TAG -f build/annotationGx.Dockerfile . 
+ diff --git a/man/cleanCellnames.Rd b/man/cleanCellnames.Rd new file mode 100644 index 0000000..9329132 --- /dev/null +++ b/man/cleanCellnames.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getCellosaurus.R +\name{cleanCellnames} +\alias{cleanCellnames} +\title{Clean Cell Names} +\usage{ +cleanCellnames(main_xml, verbose = TRUE) +} +\description{ +TODO:: +} diff --git a/man/cleanCellosaurusResponse.Rd b/man/cleanCellosaurusResponse.Rd index d9344a4..7797d54 100644 --- a/man/cleanCellosaurusResponse.Rd +++ b/man/cleanCellosaurusResponse.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/queryCellosaurus.R +% Please edit documentation in R/getCellosaurusAPI.R \name{cleanCellosaurusResponse} \alias{cleanCellosaurusResponse} \title{Clean cellosaurus responses} \usage{ -cleanCellosaurusResponse(responseList) +cleanCellosaurusResponse( + responseList, + GETfxn = c("search/cell-line?", "cell-line/") +) } \arguments{ \item{responseList}{is a list of responses} diff --git a/man/dot-checkThrottlingStatus.Rd b/man/dot-checkThrottlingStatus.Rd index f3eafde..cd83fca 100644 --- a/man/dot-checkThrottlingStatus.Rd +++ b/man/dot-checkThrottlingStatus.Rd @@ -1,14 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getPubChem.R +% Please edit documentation in R/getPubChem-helpers.R \name{.checkThrottlingStatus} \alias{.checkThrottlingStatus} \title{Checks to see if the PubChem query is exceeding the throttling limit} \usage{ -.checkThrottlingStatus(response) +.checkThrottlingStatus(result, throttleMessage = FALSE) } \arguments{ +\item{throttleMessage}{\code{logical} whether to print the throttling message} + \item{response}{\code{httr::response}} } +\value{ +\code{logical} whether the query is throttled +} \description{ Checks to see if the PubChem query is exceeding the throttling limit } diff --git a/man/createQueryURLs.Rd 
b/man/dot-createQueryURLs.Rd similarity index 63% rename from man/createQueryURLs.Rd rename to man/dot-createQueryURLs.Rd index 78169f9..abc16c5 100644 --- a/man/createQueryURLs.Rd +++ b/man/dot-createQueryURLs.Rd @@ -1,16 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/queryCellosaurus.R -\name{createQueryURLs} -\alias{createQueryURLs} +% Please edit documentation in R/getCellosaurusAPI.R +\name{.createQueryURLs} +\alias{.createQueryURLs} \title{Create a list of query URLS for Cellosaurus API} \usage{ -createQueryURLs( - api, +.createQueryURLs( + api = "https://api.cellosaurus.org/", cl_names, format = "txt", - clnum = 1, - GETfxn = "search/cell-line?", - fields + num_results = 1, + GETfxn = c("search/cell-line?", "cell-line/"), + fields, + q = "idsy:" ) } \arguments{ @@ -20,7 +21,7 @@ createQueryURLs( \item{format}{is the type of format to return from the API. Can be "txt" or "json"} -\item{clnum}{is the number of of cell lines to return, DEFAULT=1} +\item{num_results}{is the number of of items to return, DEFAULT=1} \item{GETfxn}{is the function to use on the cellosaurus website. 
Currently only supports "search/cell-line?"} @@ -34,5 +35,5 @@ This function creates a queryURL for the cellosaurus API using a list of cell li } \details{ Function to create a URL query for Cellosaurus to search for a cell-line using its name -An example call: finalURLs <- createQueryURLs(api = CLurl, cl_names = cl_names, fields = c("id", "ac")) +An example call: computedURLs <- .createQueryURLs(api = "https://api.cellosaurus.org/", cl_names = c("22rv1", "Hela"), fields = c("id", "ac")) } diff --git a/man/dot-parseATCresponse.Rd b/man/dot-parseATCresponse.Rd new file mode 100644 index 0000000..a6d6a4f --- /dev/null +++ b/man/dot-parseATCresponse.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.parseATCresponse} +\alias{.parseATCresponse} +\title{Function that parses the results of the PubChem PUG-VIEW API for the ATC Code header} +\usage{ +.parseATCresponse(response) +} +\description{ +Function that parses the results of the PubChem PUG-VIEW API for the ATC Code header +} diff --git a/man/dot-parseCASresponse.Rd b/man/dot-parseCASresponse.Rd new file mode 100644 index 0000000..2bc0510 --- /dev/null +++ b/man/dot-parseCASresponse.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.parseCASresponse} +\alias{.parseCASresponse} +\title{Function that parses the results of the PubChem PUG-VIEW API for the CAS header} +\usage{ +.parseCASresponse(response) +} +\description{ +Function that parses the results of the PubChem PUG-VIEW API for the CAS header +} diff --git a/man/dot-parseCHEMBLresponse.Rd b/man/dot-parseCHEMBLresponse.Rd new file mode 100644 index 0000000..7404b71 --- /dev/null +++ b/man/dot-parseCHEMBLresponse.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.parseCHEMBLresponse} +\alias{.parseCHEMBLresponse} +\title{Function that parses the results of 
the PubChem PUG-VIEW API for the CHEMBL ID header} +\usage{ +.parseCHEMBLresponse(response) +} +\description{ +Function that parses the results of the PubChem PUG-VIEW API for the CHEMBL ID header +} diff --git a/man/dot-parseDILIresponse.Rd b/man/dot-parseDILIresponse.Rd new file mode 100644 index 0000000..d1a2edd --- /dev/null +++ b/man/dot-parseDILIresponse.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.parseDILIresponse} +\alias{.parseDILIresponse} +\title{Function that parses the results of the PubChem PUG-VIEW API for the DILI header} +\usage{ +.parseDILIresponse(response) +} +\description{ +Function that parses the results of the PubChem PUG-VIEW API for the DILI header +} diff --git a/man/dot-parseNSCresponse.Rd b/man/dot-parseNSCresponse.Rd new file mode 100644 index 0000000..4bc5180 --- /dev/null +++ b/man/dot-parseNSCresponse.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{.parseNSCresponse} +\alias{.parseNSCresponse} +\title{Function that parses the results of the PubChem PUG-VIEW API for the NSC Number header} +\usage{ +.parseNSCresponse(response) +} +\description{ +Function that parses the results of the PubChem PUG-VIEW API for the NSC Number header +} diff --git a/man/downloadAndExtract.Rd b/man/downloadAndExtract.Rd index 7d0735d..66fa13f 100644 --- a/man/downloadAndExtract.Rd +++ b/man/downloadAndExtract.Rd @@ -24,5 +24,5 @@ the specified \code{extract_fun} for more details.} Download a compressed file from a remote URL and extract it. 
} \seealso{ -\link[utils:unzip]{utils::unzip}, \link[utils:untar]{utils::untar}, \link[R.utils:compressFile]{R.utils::gunzip}, \link[R.utils:compressFile]{R.utils::bunzip2} +\link[utils:unzip]{utils::unzip}, \link[utils:untar]{utils::untar}, \link[R.utils:gunzip]{R.utils::gunzip}, \link[R.utils:bunzip2]{R.utils::bunzip2} } diff --git a/man/getPubChemAnnotations.Rd b/man/getAllPubChemAnnotations.Rd similarity index 91% rename from man/getPubChemAnnotations.Rd rename to man/getAllPubChemAnnotations.Rd index b7c86c7..d0dca8a 100644 --- a/man/getPubChemAnnotations.Rd +++ b/man/getAllPubChemAnnotations.Rd @@ -1,19 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/getPubChem.R -\name{getPubChemAnnotations} -\alias{getPubChemAnnotations} +\name{getAllPubChemAnnotations} +\alias{getAllPubChemAnnotations} \title{Get a selected annotation for all PubChem entries} \usage{ -getPubChemAnnotations( +getAllPubChemAnnotations( header = "Available", type = "Compound", parseFUN = identity, - ..., output = "JSON", raw = FALSE, + rawAnnotationDT = FALSE, + verbose = FALSE, url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading", BPPARAM = bpparam(), - proxy = FALSE + proxy = FALSE, + retries = 3, + maxPages = NA, + ... ) } \arguments{ @@ -30,8 +34,6 @@ Defaults to identity, i.e., it returned the results unparsed. Some default parsing is implemented inside the function for 'ATC Code' and 'Drug Induced Liver Injury' headers.} -\item{...}{Force subsequent parameters to be named. Not used.} - \item{output}{\code{character(1)} The output format. Defaults to 'JSON'. For options other than 'JSON', you must set \code{raw=TRUE} or the fuction will fail.} @@ -41,6 +43,8 @@ developer use only and should not be changed.} \item{BPPARAM}{\code{BiocParallelParam} A BiocParallel back-end to parallelize with. Defaults to \code{bpparam()}. To run in serial, set to \code{SerialParam()}.} + +\item{...}{Force subsequent parameters to be named. 
Not used.} } \value{ A \code{data.table} of resulting annotations. If the header is not diff --git a/man/getCellApi.Rd b/man/getCellApi.Rd deleted file mode 100644 index 7f37f22..0000000 --- a/man/getCellApi.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/queryCellosaurus.R -\name{getCellApi} -\alias{getCellApi} -\title{Query Cellosaurus} -\usage{ -getCellApi(cl_names) -} -\arguments{ -\item{cl_names}{is a list of the cell line names} -} -\value{ -A list of responses -} -\description{ -This function takes a list of cell line names and gets responses from the Cellosaurus API -} -\details{ -Function to get responses from Cellosaurus API -} diff --git a/man/getCellosaurusAPI.Rd b/man/getCellosaurusAPI.Rd new file mode 100644 index 0000000..54c3394 --- /dev/null +++ b/man/getCellosaurusAPI.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getCellosaurusAPI.R +\name{getCellosaurusAPI} +\alias{getCellosaurusAPI} +\title{Query Cellosaurus} +\usage{ +getCellosaurusAPI( + cl_names, + fields = c("id", "ac", "sy", "misspelling", "din", "ca", "sx", "ag", "sampling-site", + "metastatic-site"), + GETfxn = c("search/cell-line?", "cell-line/"), + querydomain = "ac:" +) +} +\arguments{ +\item{cl_names}{is a list of the cell line names} + +\item{fields}{is a list of desired fields to obtain for each cell line in the API query, i.e if only trying to get synonynms and primary accesssion then fields=c("sy", "ac"). 
see https://api.cellosaurus.org/static/fields_help.html for all fields.} +} +\value{ +A list of responses +} +\description{ +This function takes a list of cell line names and interested fields and gets responses from the Cellosaurus API +} +\details{ +Function to get responses from Cellosaurus API +} diff --git a/man/getPubChemAnnotation.Rd b/man/getPubChemAnnotation.Rd new file mode 100644 index 0000000..80a751e --- /dev/null +++ b/man/getPubChemAnnotation.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getPubChem.R +\name{getPubChemAnnotation} +\alias{getPubChemAnnotation} +\title{getPubChemAnnotation} +\usage{ +getPubChemAnnotation( + compound, + header = "ChEMBL ID", + url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound", + output = "JSON", + timeout_s = 29, + retries = 3, + quiet = TRUE, + throttleMessage = FALSE +) +} +\arguments{ +\item{compound}{\code{character(1)} A valid CID to use for the query.} + +\item{header}{\code{character(1)} A valid header name for the PUG VIEW annotations} + +\item{url}{\code{character(1)} The URL to perform API queries on. default = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound'} + +\item{output}{\code{character(1)} The output format. Defaults to 'JSON'.} + +\item{timeout_s}{\code{numeric(1)} The number of seconds to wait before timing out. Default is 29.} + +\item{retries}{\code{numeric(1)} The number of times to retry a failed query. Default is 3.} + +\item{quiet}{\code{logical(1)} Should the function be quiet? Default is TRUE.} + +\item{throttleMessage}{\code{logical(1)} Should a message be printed when the query is throttled? 
Default is FALSE.} +} +\description{ +queries the PubChem PUG-VIEW API to get a single annotation using a CID for a header +} diff --git a/man/getRequestPubChem.Rd b/man/getRequestPubChem.Rd index 00e7c6b..90187c9 100644 --- a/man/getRequestPubChem.Rd +++ b/man/getRequestPubChem.Rd @@ -15,7 +15,8 @@ getRequestPubChem( operation_options = NA, proxy = FALSE, raw = FALSE, - query_only = FALSE + query_only = FALSE, + verbose = FALSE ) } \arguments{ diff --git a/man/queryPubChem.Rd b/man/queryPubChem.Rd index 2979e2a..cd486da 100644 --- a/man/queryPubChem.Rd +++ b/man/queryPubChem.Rd @@ -16,7 +16,8 @@ queryPubChem( batch = TRUE, raw = FALSE, proxy = FALSE, - query_only = FALSE + query_only = FALSE, + verbose = FALSE ) } \arguments{ diff --git a/man/queryUniProt.Rd b/man/queryUniProt.Rd deleted file mode 100644 index 2143b1c..0000000 --- a/man/queryUniProt.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getUniProt.R -\name{queryUniProt} -\alias{queryUniProt} -\title{Query the UniProt website REST API} -\usage{ -queryUniProt( - query, - columns = "", - limit = "", - offset = "", - format = "xml", - include = TRUE, - compress = TRUE, - ..., - url = "https://www.uniprot.org/uniprot" -) -} -\arguments{ -\item{query}{A \code{character} vector of the API query to return. See -https://www.uniprot.org/help/text-search for information on query -structure.} - -\item{format}{A \code{character} vector specifying the return format. Options -are 'html', 'tab', 'xls', 'fasta', 'gff', 'txt', 'xml', 'rdf', 'list' or -'rss'. Default is 'xml'. See https://www.uniprot.org/help/api\%5Fqueries -for more information.} - -\item{include}{A \code{boolean} indicating if isoforms or description of -referenced data should be included.} - -\item{compress}{TODO::} - -\item{...}{Catch all unnamed arguments. All parameters after ... 
must be -specified with as name=value or they will be ignored.} - -\item{url}{A \code{character} vector of the URL for the REST API. Should not -include trailing '/', this will be added inside the function. This -parameter must be passed named or it will not work.} -} -\value{ -A httr::response object containing the query results as \code{format}. -} -\description{ -Query the UniProt website REST API -}