diff --git a/DESCRIPTION b/DESCRIPTION index 388da8b..f591170 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: AnnotationGx Title: AnnotationGx: A package for building, updating and querying an annotation database for pharmaco-genomic data -Version: 0.0.0.9077 +Version: 0.0.0.9080 Authors@R: c( person("Jermiah", "Joseph", role = c("aut", "cre"), email = "jermiah.joseph@gmail.com"), diff --git a/NAMESPACE b/NAMESPACE index 5cb5eb2..7f45ab5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,9 +10,11 @@ export(getPubchemAnnotationHeadings) export(getPubchemCompound) export(getPubchemProperties) export(getPubchemStatus) +export(getUnichemSources) export(mapCID2Properties) export(mapCell2Accession) export(mapCompound2CID) +export(queryUnichem) export(standardize_names) import(BiocParallel) import(data.table) diff --git a/R/unichem.R b/R/unichem.R new file mode 100644 index 0000000..82e64a6 --- /dev/null +++ b/R/unichem.R @@ -0,0 +1,126 @@ + +# Unichem API documentation: https://www.ebi.ac.uk/unichem/info/webservices + + +#' Get the list of sources in UniChem. 
+#' +#' Returns a `data.table` with the following columns: +#' - `CompoundCount` (integer): Total number of compounds provided by that source +#' - `BaseURL` (string): Source Base URL for compounds +#' - `Description` (string): Source database description +#' - `LastUpdated` (string): Date on which the source database was last updated +#' - `Name` (string): Short name of the source database +#' - `NameLabel` (string): Machine-readable label name of the source database +#' - `NameLong` (string): Full name of the source database +#' - `SourceID` (integer): Unique ID for the source database +#' - `Details` (string): Notes about the source +#' - `ReleaseDate` (string): Date on which the source database was released +#' - `ReleaseNumber` (integer): Release number of the source database data stored in UniChem +#' - `URL` (string): Main URL for the source +#' - `UpdateComments` (string): Notes about the update process of that source to UniChem +#' +#' +#' @return A data.table with the list of sources in UniChem. 
+#'
+#' @export
+getUnichemSources <- function() {
+  funContext <- .funContext("AnnotationGx::getUnichemSources")
+
+  # Query the UniChem `sources` endpoint and parse the JSON reply.
+  response <- .build_unichem_query("sources") |>
+    .build_request() |>
+    .perform_request() |>
+    .parse_resp_json()
+
+  # isTRUE() keeps the condition a single TRUE/FALSE even when the `response`
+  # field is absent or NA; a bare `!=` would then fail inside `if()` before
+  # the intended error message could be raised.
+  if (!isTRUE(response$response == "Success")) {
+    .err(funContext, "Unichem API request failed.")
+  }
+
+  .debug(funContext, sprintf("Unichem sourceCount: %s", response$totalSources))
+
+  sources_dt <- .asDT(response$sources)
+
+  # Field names as delivered by the API (old_names) and the column names this
+  # function documents (new_names); the two vectors are matched by position.
+  old_names <- c(
+    "UCICount", "baseIdUrl", "description", "lastUpdated", "name",
+    "nameLabel", "nameLong", "sourceID", "srcDetails", "srcReleaseDate",
+    "srcReleaseNumber", "srcUrl", "updateComments")
+
+  new_names <- c(
+    "CompoundCount", "BaseURL", "Description", "LastUpdated", "Name",
+    "NameLabel", "NameLong", "SourceID", "Details", "ReleaseDate",
+    "ReleaseNumber", "URL", "UpdateComments")
+
+  setnames(sources_dt, old_names, new_names)
+
+  # Column order of the returned table.
+  new_order <- c(
+    "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount",
+    "BaseURL", "URL", "Details",
+    "Description", "ReleaseNumber", "ReleaseDate", "LastUpdated",
+    "UpdateComments"
+  )
+
+  # The `..` prefix tells data.table to look `new_order` up as a variable in
+  # the calling scope rather than as a column name.
+  sources_dt[, ..new_order]
+
+}
+
+#' Query UniChem for a compound.
+#'
+#' This function queries the UniChem API for a compound based on the provided parameters.
+#'
+#' @param type `character` The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID".
+#' @param compound `character` or `integer` The compound identifier to search for.
+#' @param sourceID `integer` The source ID to search for if the type is "sourceID". Defaults to `NA_integer_`.
+#' @param request_only `logical` Whether to return the request only. Defaults to FALSE.
+#' @param raw `logical` Whether to return the raw response. Defaults to FALSE.
+#' @param ... Additional arguments.
+#'
+#' @return A list with the external mappings and the UniChem mappings. 
+#'
+#' @examples
+#' queryUnichem(type = "sourceID", compound = "444795", sourceID = 22)
+#'
+#' @export
+queryUnichem <- function(
+  type, compound, sourceID = NA_integer_, request_only = FALSE, raw = FALSE, ...
+){
+  funContext <- .funContext("AnnotationGx::queryUnichem")
+
+  checkmate::assert_string(type)
+  checkmate::assert_atomic(compound)
+  checkmate::assert_integerish(sourceID)
+  # Both switches are used directly in `if()`, so they must be a single
+  # non-missing TRUE/FALSE rather than an arbitrary logical vector.
+  checkmate::assert_flag(request_only)
+  checkmate::assert_flag(raw)
+
+  request <- .build_unichem_compound_req(type, compound, sourceID, ...)
+  if (request_only) return(request)
+
+  response <- request |>
+    .perform_request() |>
+    .parse_resp_json()
+
+  # The raw response is handed back as-is, before the status check below.
+  if (raw) return(response)
+
+  # isTRUE() keeps the condition well-defined when the `response` field is
+  # absent or NA, so the intended error is raised instead of an `if()` failure.
+  if (!isTRUE(response$response == "Success")) {
+    .err(funContext, "Unichem API request failed.")
+  }
+
+  # Mapping names to be consistent with other API calls
+  mapped_sources_dt <- .asDT(response$compounds$sources)
+  old_names <- c("compoundId", "shortName", "longName", "id", "url")
+  # NOTE(review): "sourcURL" looks like a typo for "sourceURL"; it is kept
+  # because the package tests reference this exact column name.
+  new_names <- c("compoundID", "Name", "NameLong", "sourceID", "sourcURL")
+  setnames(mapped_sources_dt, old = old_names, new = new_names)
+
+  # `..new_names` selects the columns named by the variable `new_names`.
+  External_Mappings <- mapped_sources_dt[, ..new_names]
+
+  UniChem_Mappings <- list(
+    UniChem.UCI = response$compounds$uci,
+    UniChem.InchiKey = response$compounds$standardInchiKey,
+    UniChem.Inchi = response$compounds$inchi$inchi,
+    UniChem.formula = response$compounds$inchi$formula,
+    UniChem.connections = response$compounds$inchi$connections,
+    UniChem.hAtoms = response$compounds$inchi$hAtoms
+  )
+
+  list(
+    External_Mappings = External_Mappings,
+    UniChem_Mappings = UniChem_Mappings
+  )
+
+}
\ No newline at end of file
diff --git a/R/unichem_helpers.R b/R/unichem_helpers.R
new file mode 100644
index 0000000..a27398f
--- /dev/null
+++ b/R/unichem_helpers.R
@@ -0,0 +1,87 @@
+#' Build a UniChem query URL
+#'
+#' This function builds a UniChem query URL based on the specified endpoint. 
+#'
+#' @param endpoint The UniChem endpoint to query (valid options: "compounds", "connectivity", "images", "sources")
+#' @param query_only Logical indicating whether to return only the query URL without building it (default: FALSE)
+#'
+#' @return `httr2::httr2_url` object if `query_only` is TRUE, otherwise the built URL.
+#'
+#' @examples
+#' .build_unichem_query("sources")
+#' .build_unichem_query("connectivity", query_only = TRUE)
+#'
+#' @noRd
+#' @keywords internal
+.build_unichem_query <- function(
+  endpoint, query_only = FALSE
+) {
+  funContext <- .funContext("AnnotationGx:::.build_unichem_query")
+
+  # Exactly one endpoint is appended to the path, so require a single value
+  # from the allowed set (assert_subset would also accept empty or
+  # multi-element input).
+  valid_endpoints <- c("compounds", "connectivity", "images", "sources")
+  checkmate::assert_choice(endpoint, valid_endpoints)
+  checkmate::assert_flag(query_only)
+
+  unichem_api <- "https://www.ebi.ac.uk/unichem/api/v1"
+  url <- httr2::url_parse(unichem_api)
+  url$path <- .buildURL(url$path, endpoint)
+
+  .debug(funContext, "URL: ", capture.output(show(url)))
+
+  # Hand back the parsed URL object untouched when only the query is wanted.
+  if (query_only) return(url)
+
+  httr2::url_build(url)
+}
+
+
+#' Build a UniChem compound request
+#'
+#' This function builds a UniChem compound request based on the provided parameters.
+#'
+#' @param type The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID".
+#' @param compound The compound identifier to search for.
+#' @param sourceID The source ID to search for if the type is "sourceID". Defaults to NULL.
+#' @param ... Additional arguments (currently unused).
+#'
+#' @return A `httr2_request` request object for the UniChem compound query.
+#'
+#' @examples
+#' .build_unichem_compound_req(type = "uci", compound = "538323")
+#' .build_unichem_compound_req(type = "sourceID", sourceID = 22, compound = "2244")
+#'
+#' @noRd
+#' @keywords internal
+.build_unichem_compound_req <- function(
+  type, compound, sourceID = NULL, ...
+){
+  funContext <- .funContext("AnnotationGx:::.build_unichem_compound_req")
+
+  # `type` is compared with `==` inside `if()` below, so it has to be a single
+  # value from the allowed set.
+  valid_types <- c("uci", "inchi", "inchikey", "sourceID")
+  checkmate::assert_choice(type, valid_types)
+
+  base_url <- .build_unichem_query("compounds")
+
+  .debug(funContext, "Base URL: ", capture.output(show(base_url)))
+
+  body <- list(
+    type = type,
+    compound = compound
+  )
+
+  # `sourceID` is only validated and sent for lookups by source; for every
+  # other `type` it is ignored and left out of the request body.
+  if (type == "sourceID") {
+    checkmate::assert_integerish(
+      x = sourceID,
+      lower = 1,
+      # The upper bound comes from getUnichemSources(), i.e. the source list
+      # is fetched on every call that uses type = "sourceID".
+      upper = max(getUnichemSources()$SourceID),
+      # Reject NA (e.g. the NA_integer_ default of queryUnichem) instead of
+      # silently putting a missing source ID into the request body.
+      any.missing = FALSE,
+      len = 1
+    )
+    body$sourceID <- sourceID
+  }
+
+
+  request <- base_url |>
+    .build_request() |>
+    httr2::req_body_json(body)
+
+  .debug(funContext, "Request: ", capture.output(show(request)))
+  request
+}
diff --git a/man/getUnichemSources.Rd b/man/getUnichemSources.Rd
new file mode 100644
index 0000000..6822f0e
--- /dev/null
+++ b/man/getUnichemSources.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unichem.R
+\name{getUnichemSources}
+\alias{getUnichemSources}
+\title{Get the list of sources in UniChem.}
+\usage{
+getUnichemSources()
+}
+\value{
+A data.table with the list of sources in UniChem. 
+} +\description{ +Returns a \code{data.table} with the following columns: +\itemize{ +\item \code{CompoundCount} (integer): Total number of compounds provided by that source +\item \code{BaseURL} (string): Source Base URL for compounds +\item \code{Description} (string): Source database description +\item \code{LastUpdated} (string): Date on which the source database was last updated +\item \code{Name} (string): Short name of the source database +\item \code{NameLabel} (string): Machine-readable label name of the source database +\item \code{NameLong} (string): Full name of the source database +\item \code{SourceID} (integer): Unique ID for the source database +\item \code{Details} (string): Notes about the source +\item \code{ReleaseDate} (string): Date on which the source database was released +\item \code{ReleaseNumber} (integer): Release number of the source database data stored in UniChem +\item \code{URL} (string): Main URL for the source +\item \code{UpdateComments} (string): Notes about the update process of that source to UniChem +} +} diff --git a/man/queryUnichem.Rd b/man/queryUnichem.Rd new file mode 100644 index 0000000..2e7b1e9 --- /dev/null +++ b/man/queryUnichem.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/unichem.R +\name{queryUnichem} +\alias{queryUnichem} +\title{Query UniChem for a compound.} +\usage{ +queryUnichem( + type, + compound, + sourceID = NA_integer_, + request_only = FALSE, + raw = FALSE, + ... +) +} +\arguments{ +\item{type}{\code{character} The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID".} + +\item{compound}{\code{character} or \code{integer} The compound identifier to search for.} + +\item{sourceID}{\code{integer} The source ID to search for if the type is "sourceID". Defaults to \code{NA_integer_}.} + +\item{request_only}{\code{boolean} Whether to return the request only. 
Defaults to FALSE.}
+
+\item{raw}{\code{boolean} Whether to return the raw response. Defaults to FALSE.}
+
+\item{...}{Additional arguments.}
+}
+\value{
+A list with the external mappings and the UniChem mappings.
+}
+\description{
+This function queries the UniChem API for a compound based on the provided parameters.
+}
+\examples{
+queryUnichem(type = "sourceID", compound = "444795", sourceID = 22)
+
+}
diff --git a/tests/testthat/test_unichem.R b/tests/testthat/test_unichem.R
new file mode 100644
index 0000000..dfba62a
--- /dev/null
+++ b/tests/testthat/test_unichem.R
@@ -0,0 +1,73 @@
+library(testthat)
+library(AnnotationGx)
+library(checkmate)
+
+# These tests call getUnichemSources() / queryUnichem() for real, i.e. they
+# send requests to the UniChem API.
+
+test_that("getUnichemSources returns a data.table with the correct columns", {
+  sources <- getUnichemSources()
+
+  expected_columns <- c(
+    "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount",
+    "BaseURL", "URL", "Details", "Description", "ReleaseNumber",
+    "ReleaseDate", "LastUpdated", "UpdateComments"
+  )
+
+  expect_data_table(
+    sources,
+    all.missing = FALSE,
+    min.rows = 40, # As of March 2024
+    min.cols = 13, # As of March 2024
+    col.names = 'named',
+    info = "The data.table should have the correct columns.
+    The min number of rows and columns may change over time and is set on
+    from UniChem as of March 2024."
+  )
+
+  # Actually verify the column set announced in the test description;
+  # `expected_columns` was previously defined but never checked.
+  checkmate::expect_names(
+    names(sources),
+    permutation.of = expected_columns
+  )
+})
+
+
+test_that("queryUnichem returns the expected results", {
+  # Test case 1
+  result1 <- queryUnichem(type = "sourceID", compound = "444795", sourceID = 22)
+  expect_true(is.list(result1))
+  expect_true("External_Mappings" %in% names(result1))
+  expect_true("UniChem_Mappings" %in% names(result1))
+
+  # Test case 2
+  expect_error(queryUnichem(type = "inchikey", compound = "InchiKey123"))
+
+})
+
+test_that("queryUnichem returns the expected results 2", {
+  # Test case 1: raw = TRUE returns the parsed API response unchanged
+  result1 <- queryUnichem(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = TRUE)
+
+  expect_true(is.list(result1))
+
+
+  checkmate::expect_names(
+    names(result1),
+    subset.of=c("compounds", "notFound", "response", "totalCompounds"))
+
+  checkmate::expect_names(
+    names(result1$compounds),
+    subset.of=c("inchi", "sources", "standardInchiKey", "uci")
+  )
+
+  # raw = FALSE returns the two renamed mapping tables
+  result2 <- queryUnichem(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = FALSE)
+
+  expect_true(is.list(result2))
+
+  checkmate::expect_names(
+    names(result2$External_Mappings),
+    subset.of = c("compoundID", "Name", "NameLong", "sourceID", "sourcURL")
+  )
+
+  checkmate::expect_names(
+    names(result2$UniChem_Mappings),
+    subset.of = c(
+      "UniChem.UCI", "UniChem.InchiKey", 'UniChem.Inchi',
+      'UniChem.formula','UniChem.connections','UniChem.hAtoms'
+    )
+  )
+
+
+})
\ No newline at end of file
diff --git a/tests/testthat/test_unichem_helpers.R b/tests/testthat/test_unichem_helpers.R
new file mode 100644
index 0000000..db48640
--- /dev/null
+++ b/tests/testthat/test_unichem_helpers.R
@@ -0,0 +1,74 @@
+library(testthat)
+library(AnnotationGx)
+library(checkmate)
+
+test_that("Valid endpoint returns correct URL", {
+  endpoint <- "compounds"
+  expected_url <- "https://www.ebi.ac.uk/unichem/api/v1/compounds"
+  actual_url <- .build_unichem_query(endpoint)
+  expect_equal(actual_url, expected_url)
+})
+
+test_that("Invalid endpoint throws an error", {
+  # An endpoint outside the allowed set must be rejected.
+  expect_error(.build_unichem_query("invalid_endpoint"))
+})
+
+test_that("Query only option returns httr2::httr2_url object", {
+  # With query_only = TRUE the parsed URL object is returned, not a string.
+  parsed_url <- .build_unichem_query("images", TRUE)
+  expect_class(parsed_url, "httr2_url")
+})
+
+
+test_that("Valid compound request is built correctly", {
+  req <- .build_unichem_compound_req("uci", "538323")
+
+  expect_equal(req$url, "https://www.ebi.ac.uk/unichem/api/v1/compounds")
+  expect_equal(
+    req$body$data,
+    list(
+      type = "uci",
+      compound = "538323"
+    )
+  )
+})
+
+test_that("Valid sourceID compound request is built correctly", {
+  req <- .build_unichem_compound_req("sourceID", "2244", 22)
+
+  expect_equal(req$url, "https://www.ebi.ac.uk/unichem/api/v1/compounds")
+  expect_equal(
+    req$body$data,
+    list(
+      type = "sourceID",
+      compound = "2244",
+      sourceID = 22
+    )
+  )
+
+
+  # Send the request and check the top-level shape of the parsed reply.
+  parsed <- .parse_resp_json(.perform_request(req))
+
+  checkmate::expect_names(
+    names(parsed),
+    subset.of=c("compounds", "notFound", "response", "totalCompounds"))
+
+  checkmate::expect_names(
+    names(parsed$compounds),
+    subset.of=c("inchi", "sources", "standardInchiKey", "uci")
+  )
+
+
+})
+
+test_that("Invalid type throws an error", {
+  # A `type` outside the allowed set must be rejected.
+  expect_error(.build_unichem_compound_req("invalid_type", "538323"))
+})
diff --git a/vignettes/treatment_pipeline.Rmd b/vignettes/treatment_pipeline.Rmd
new file mode 100644
index 0000000..47bd9c4
--- /dev/null
+++ b/vignettes/treatment_pipeline.Rmd
@@ -0,0 +1,45 @@
+---
+title: "Annotating Treatments Pipeline"
+output: 
rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Annotating Treatments Pipeline} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# THIS VIGNETTE IS A WORK IN PROGRESS + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(AnnotationGx) +``` + +```{r mapCompounds to CID} + +data(ctrp_treatmentIDs) +treatmentMetadata <- ctrp_treatmentIDs[1:5] +treatmentMetadata +names_to_cids <- AnnotationGx::mapCompound2CID(treatmentMetadata$CTRP.treatmentid, first = TRUE) + + +``` + + +```{r use CID in unichem} + +sources <- getUnichemSources() +response <- queryUnichem( + type = "sourceID", + compound = names_to_cids[1, cids], + sourceID = sources[Name == "pubchem", SourceID] +) + +response +``` +