Skip to content

Commit

Permalink
feat: add unichem functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
jjjermiah committed Mar 11, 2024
1 parent ea38d7b commit c8d88cc
Show file tree
Hide file tree
Showing 7 changed files with 464 additions and 0 deletions.
124 changes: 124 additions & 0 deletions R/unichem.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@

# Unichem API documentation: https://www.ebi.ac.uk/unichem/info/webservices


#' Get the list of sources in UniChem.
#'
#' Queries the UniChem `sources` endpoint and returns the result as a
#' `data.table` with the following columns, in this order:
#' - `Name` (string): Short name of the source database
#' - `NameLabel` (string): Machine readable label name of the source database
#' - `NameLong` (string): Full name of the source database
#' - `SourceID` (integer): Unique ID for the source database
#' - `CompoundCount` (integer): Total of compounds provided by that source
#' - `BaseURL` (string): Source Base URL for compounds
#' - `URL` (string): Main URL for the source
#' - `Details` (string): Notes about the source
#' - `Description` (string): Source database description
#' - `ReleaseNumber` (integer): Release number of the source database data
#'   stored in UniChem
#' - `ReleaseDate` (string): Date on which the source database was released
#' - `LastUpdated` (string): Date on which the source database was last updated
#' - `UpdateComments` (string): Notes about the update process of that source
#'   to UniChem
#'
#' An error is raised through `.err()` when the API response does not report
#' `"Success"`.
#'
#' @return A data.table with the list of sources in UniChem.
#'
#' @export
getUnichemSources <- function() {
  response <- .build_unichem_query("sources") |>
    .build_request() |>
    .perform_request() |>
    .parse_resp_json()

  # `isTRUE()` makes a missing, NULL or NA status field take the same error
  # path as an explicit failure instead of crashing the `if` itself.
  if (!isTRUE(response$response == "Success")) {
    .err("Unichem API request failed.")
  }

  .debug(sprintf("Unichem sourceCount: %s", response$totalSources))

  sources_dt <- .asDT(response$sources)

  # API field name -> column name exposed by this package.
  column_map <- c(
    UCICount = "CompoundCount",
    baseIdUrl = "BaseURL",
    description = "Description",
    lastUpdated = "LastUpdated",
    name = "Name",
    nameLabel = "NameLabel",
    nameLong = "NameLong",
    sourceID = "SourceID",
    srcDetails = "Details",
    srcReleaseDate = "ReleaseDate",
    srcReleaseNumber = "ReleaseNumber",
    srcUrl = "URL",
    updateComments = "UpdateComments"
  )
  setnames(sources_dt, names(column_map), unname(column_map))

  # Column order of the returned table.
  new_order <- c(
    "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount",
    "BaseURL", "URL", "Details",
    "Description", "ReleaseNumber", "ReleaseDate", "LastUpdated",
    "UpdateComments"
  )

  sources_dt[, new_order, with = FALSE]
}

#' Query UniChem for a compound.
#'
#' This function queries the UniChem API for a compound based on the provided
#' parameters.
#'
#' @param type `character(1)` The type of compound identifier to search for.
#'   Valid types are "uci", "inchi", "inchikey", and "sourceID".
#' @param compound `character(1)` The compound identifier to search for.
#' @param sourceID `integer` The source ID to search for if the type is
#'   "sourceID". Defaults to `NA_integer_`; `NULL` is also accepted.
#' @param request_only `logical(1)` Whether to return the request only,
#'   without performing it. Defaults to FALSE.
#' @param raw `logical(1)` Whether to return the parsed response as-is.
#'   Defaults to FALSE. The raw response is returned before the success
#'   status is checked.
#' @param ... Additional arguments passed on to the request builder.
#'
#' @return By default, a list with two elements: `External_Mappings`
#'   (a data.table with columns `compoundID`, `Name`, `NameLong`, `sourceID`
#'   and `sourcURL`) and `UniChem_Mappings` (a list with the UniChem UCI,
#'   InChIKey, InChI, formula, connections and hAtoms fields). If
#'   `request_only` is TRUE the request object is returned instead; if `raw`
#'   is TRUE the parsed response is returned instead.
#'
#' @examples
#' queryUnichem(type = "sourceID", compound = "444795", sourceID = 22)
#'
#' @export
queryUnichem <- function(
  type, compound, sourceID = NA_integer_, request_only = FALSE, raw = FALSE, ...
) {
  checkmate::assert_string(type)
  checkmate::assert_string(compound)
  checkmate::assert_integerish(sourceID, null.ok = TRUE)
  # Both switches drive an `if`, so they must be single non-missing logicals.
  checkmate::assert_flag(request_only)
  checkmate::assert_flag(raw)

  request <- .build_unichem_compound_req(type, compound, sourceID, ...)
  if (request_only) {
    return(request)
  }

  response <- request |>
    .perform_request() |>
    .parse_resp_json()

  if (raw) {
    return(response)
  }

  # `isTRUE()` makes a missing, NULL or NA status field take the same error
  # path as an explicit failure instead of crashing the `if` itself.
  if (!isTRUE(response$response == "Success")) {
    .err("Unichem API request failed.")
  }

  # Mapping names to be consistent with other API calls.
  # NOTE: the "sourcURL" spelling is part of the returned column names that
  # the package tests check for, so it is kept as-is.
  mapped_sources_dt <- .asDT(response$compounds$sources)
  old_names <- c("compoundId", "shortName", "longName", "id", "url")
  new_names <- c("compoundID", "Name", "NameLong", "sourceID", "sourcURL")
  setnames(mapped_sources_dt, old = old_names, new = new_names)

  External_Mappings <- mapped_sources_dt[, new_names, with = FALSE]

  inchi <- response$compounds$inchi
  UniChem_Mappings <- list(
    UniChem.UCI = response$compounds$uci,
    UniChem.InchiKey = response$compounds$standardInchiKey,
    UniChem.Inchi = inchi$inchi,
    UniChem.formula = inchi$formula,
    UniChem.connections = inchi$connections,
    UniChem.hAtoms = inchi$hAtoms
  )

  list(
    External_Mappings = External_Mappings,
    UniChem_Mappings = UniChem_Mappings
  )
}
80 changes: 80 additions & 0 deletions R/unichem_helpers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#' Build a UniChem query URL
#'
#' This function builds a UniChem query URL for the specified endpoint of the
#' UniChem v1 API.
#'
#' @param endpoint `character(1)` The UniChem endpoint to query. Must be
#'   exactly one of "compounds", "connectivity", "images" or "sources".
#' @param query_only `logical(1)` If TRUE, return the parsed URL object
#'   instead of the built URL string (default: FALSE).
#'
#' @return The parsed URL object produced by `httr2::url_parse()` if
#'   `query_only` is TRUE, otherwise the URL built by `httr2::url_build()`.
#'
#' @examples
#' .build_unichem_query("sources")
#' .build_unichem_query("connectivity", query_only = TRUE)
#'
#' @noRd
#' @keywords internal
.build_unichem_query <- function(
  endpoint, query_only = FALSE
) {
  valid_endpoints <- c("compounds", "connectivity", "images", "sources")
  # `assert_choice()` requires a single value; `assert_subset()` would also
  # let an empty or multi-element `endpoint` through.
  checkmate::assert_choice(endpoint, valid_endpoints)
  checkmate::assert_flag(query_only)

  unichem_api <- "https://www.ebi.ac.uk/unichem/api/v1"
  url <- httr2::url_parse(unichem_api)
  url$path <- .buildURL(url$path, endpoint)

  if (query_only) {
    return(url)
  }

  httr2::url_build(url)
}


#' Build a UniChem compound request
#'
#' This function builds a UniChem compound request based on the provided
#' parameters. The request targets the "compounds" endpoint and carries a
#' JSON body with `type`, `compound` and, for `type = "sourceID"`, `sourceID`.
#'
#' @param type `character(1)` The type of compound identifier to search for.
#'   Must be exactly one of "uci", "inchi", "inchikey" or "sourceID".
#' @param compound The compound identifier to search for.
#' @param sourceID The source ID to search for if the type is "sourceID".
#'   Must then be a single non-missing whole number between 1 and the largest
#'   `SourceID` returned by `getUnichemSources()`. It is ignored for other
#'   types. Defaults to NULL.
#' @param ... Additional arguments. Currently unused by this function.
#'
#' @return A `httr2_request` request object for the UniChem compound query.
#'
#' @examples
#' .build_unichem_compound_req(type = "uci", compound = "538323")
#' .build_unichem_compound_req(type = "sourceID", sourceID = 22, compound = "2244")
#'
#' @noRd
#' @keywords internal
.build_unichem_compound_req <- function(
  type, compound, sourceID = NULL, ...
) {
  valid_types <- c("uci", "inchi", "inchikey", "sourceID")
  # A single value is required because `type` drives the scalar `if` below.
  checkmate::assert_choice(type, valid_types)

  body <- list(
    type = type,
    compound = compound
  )

  if (type == "sourceID") {
    # `any.missing = FALSE` rejects NA, which would otherwise pass the bounds
    # check and be sent to the API as the source ID.
    # NOTE: computing `upper` calls getUnichemSources() on every validation.
    checkmate::assert_integerish(
      x = sourceID,
      lower = 1,
      upper = max(getUnichemSources()$SourceID),
      any.missing = FALSE,
      len = 1
    )
    body$sourceID <- sourceID
  }

  .build_unichem_query("compounds") |>
    .build_request() |>
    httr2::req_body_json(body)
}
29 changes: 29 additions & 0 deletions man/getUnichemSources.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions man/queryUnichem.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

73 changes: 73 additions & 0 deletions tests/testthat/test_unichem.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
library(testthat)
library(AnnotationGx)
library(checkmate)

test_that("getUnichemSources returns a data.table with the correct columns", {
  sources <- getUnichemSources()

  # Column names and order that getUnichemSources() is expected to return.
  expected_columns <- c(
    "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount",
    "BaseURL", "URL", "Details", "Description", "ReleaseNumber",
    "ReleaseDate", "LastUpdated", "UpdateComments"
  )

  expect_data_table(
    sources,
    all.missing = FALSE,
    min.rows = 40, # As of March 2024
    min.cols = 13, # As of March 2024
    col.names = "named",
    info = "The data.table should have the correct columns.
The min number of rows and columns may change over time and is set on
from UniChem as of March 2024."
  )

  # `col.names = "named"` only checks that names exist, so the actual column
  # names are compared against the expected set here.
  expect_named(sources, expected_columns)
})


test_that("queryUnichem returns the expected results", {
  # A sourceID lookup should return a list holding both mapping entries.
  mapping <- queryUnichem(type = "sourceID", compound = "444795", sourceID = 22)
  expect_true(is.list(mapping))

  mapping_names <- names(mapping)
  expect_true("External_Mappings" %in% mapping_names)
  expect_true("UniChem_Mappings" %in% mapping_names)

  # A made-up identifier is expected to raise an error.
  expect_error(queryUnichem(type = "inchikey", compound = "InchiKey123"))
})

test_that("queryUnichem returns the expected results 2", {
  inchikey <- "BSYNRYMUTXBXSQ-UHFFFAOYSA-N"

  # Raw mode: the parsed API response is returned as-is.
  raw_result <- queryUnichem(type = "inchikey", compound = inchikey, raw = TRUE)

  expect_true(is.list(raw_result))

  checkmate::expect_names(
    names(raw_result),
    subset.of = c("compounds", "notFound", "response", "totalCompounds")
  )

  checkmate::expect_names(
    names(raw_result$compounds),
    subset.of = c("inchi", "sources", "standardInchiKey", "uci")
  )

  # Default mode: the response is reshaped into the two mapping entries.
  mapped_result <- queryUnichem(
    type = "inchikey", compound = inchikey, raw = FALSE
  )

  expect_true(is.list(mapped_result))

  checkmate::expect_names(
    names(mapped_result$External_Mappings),
    subset.of = c("compoundID", "Name", "NameLong", "sourceID", "sourcURL")
  )

  checkmate::expect_names(
    names(mapped_result$UniChem_Mappings),
    subset.of = c(
      "UniChem.UCI", "UniChem.InchiKey", "UniChem.Inchi",
      "UniChem.formula", "UniChem.connections", "UniChem.hAtoms"
    )
  )
})
Loading

0 comments on commit c8d88cc

Please sign in to comment.