diff --git a/NAMESPACE b/NAMESPACE index fd518b1..bf4bfaa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,7 @@ export(grep_file_names) export(httpRequestUniChem) export(infer_gencode_type) export(listColToDT) +export(mapCellosaursAccessionsToFields) export(moleculeQuery) export(parseJSON) export(postRequestUniChem) diff --git a/R/getCellosaurusAPI.R b/R/getCellosaurusAPI.R index 1b07fed..3fe34d8 100644 --- a/R/getCellosaurusAPI.R +++ b/R/getCellosaurusAPI.R @@ -1,32 +1,39 @@ +#' Construct Cellosaurus API URL +#' @keywords internal +.constructCellosaurusURL <- function(query, from, extResource, to, numResults) { + cellosaurus_api_url <- "https://api.cellosaurus.org/" + resource <- "search/cell-line?" + + if (from == "dr") { + checkmate::assert_character(extResource) + from <- paste0(from, ":", extResource) + q <- paste0("q=", from, ";", query) + } else { + q <- paste0("q=", from, ":", query) + } + url <- paste0( + cellosaurus_api_url, resource, + q, + "&format=tsv", + "&fields=", paste(to, collapse = ","), + "&rows=", numResults + ) - -# Allowed Fields: -#' id, sy, idsy, ac, acas, dr, ref, rx, ra, rt, rl, -#' ww, genome-ancestry, hla, registration, sequence-variation, -#' anecdotal, biotechnology, breed, caution, cell-type, -#' characteristics, donor-info, -#' derived-from-site, discontinued, doubling-time, from, group, -#' karyotype, knockout, msi, miscellaneous, misspelling, mab-isotype, -#' mab-target, omics, part-of, population, problematic, resistance, -#' senescence, transfected, transformant, virology, -#' cc, str, di, din, dio, ox, sx, ag, oi, hi, ch, ca, dt, dtc, dtu, dtv + return(url) +} #' Search Cellosaurus API #' #' This function searches the Cellosaurus API for cell line information based on the provided query. #' -#' @param query The query string to search for. +#' @param query The query string to search for in the Cellosaurus API. #' @param from The field to search in. Default is "id". -#' @param extResource The external resource to search in, only applicable when from is "dr". -#' @param format The format of the response. Only "txt" format is currently supported. -#' @param to The fields to include in the response. The default is : -#' "id", "ac", "sy", "ca", "sx", "ag", "din", "derived-from-site", "misspelling") +#' @param extResource The external resource to search in. Default is NULL. +#' @param to The fields to return in the response. Default is "id". #' @param numResults The number of results to return. Default is 1. -#' @param returnURL Logical indicating whether to return the constructed URL instead of making the API request. Default is FALSE. -#' -#' @return A data frame containing the search results. +#' @param returnURL Logical indicating whether to return the constructed API URL. Default is FALSE. #' #' @examples #' searchCellosaurusAPI("MCF-7") @@ -36,52 +43,27 @@ searchCellosaurusAPI <- function( query, from = "id", extResource = NULL, - format = "tsv", to = c("id", "ac", "ca", "sx", "ag", "di", "derived-from-site", "misspelling"), numResults = 1, returnURL = FALSE ){ - if(!format %in% c("txt","tsv")) stop("Only txt,tsvformat is currently supported") - checkmate::assert_character(from) checkmate::assert_character(query) - checkmate::assert_character(format) - - cellosaurus_api_url <- "https://api.cellosaurus.org/" - resource <- "search/cell-line?" - - if(from == "dr"){ - checkmate::assert_character(extResource) - from <- paste0(from, ":", extResource) - q <- paste0("q=", from, ";", query) - }else{ - q <- paste0("q=", from, ":", query) - } - - url <- paste0( - cellosaurus_api_url, resource, - q, - "&format=", format, - "&fields=", paste(to, collapse=","), - "&rows=", numResults) + url <- .constructCellosaurusURL(query, from, extResource, to, numResults) if(returnURL) return(url) - response <- httr::GET(URLencode(url)) - - if(format == "tsv"){ - response <- httr::content(response, "text") - response <- data.table::fread(text = response, sep = "\t") - response <- .parseCellosaurusTSVResponse(response) - }else if(format == "txt"){ - response <- httr::content(response) - response <- .parseCellosaurusTXTResponse(response) - } - response <- response[, "queryField" := query] + response <- httr::RETRY( + "GET", URLencode(url), times = 5, quiet = TRUE + ) + response <- httr::content(response, "text") + response <- data.table::fread(text = response, sep = "\t") + response <- .parseCellosaurusTSVResponse(response) + + response$queryField <- query return(response) } - #' Parse Cellosaurus TSV Response #' #' This function parses the response from the Cellosaurus API and converts it into a data.table. @@ -92,6 +74,7 @@ searchCellosaurusAPI <- function( #' #' @export .parseCellosaurusTSVResponse <- function(response) { + lookup <- c( "id" = "Name", "ac" = "Accession", @@ -107,7 +90,7 @@ searchCellosaurusAPI <- function( "hi" = "ParentCellLine", "dt" = "Date" ) - + result <- lapply( names(response), function(x) { @@ -118,6 +101,7 @@ searchCellosaurusAPI <- function( data.table::as.data.table(t(unlist(result))) } + #' getCellosaurusAccesions Function #' #' This function retrieves Cellosaurus accessions for a given set of samples. @@ -132,16 +116,19 @@ searchCellosaurusAPI <- function( #' #' @export getCellosaurusAccesions <- function(samples, from = "idsy", ..., threads=1){ + BPPARAM <- BiocParallel::MulticoreParam( + workers = threads, + progressbar = TRUE) + results <- BiocParallel::bplapply(samples, function(sampleID){ searchCellosaurusAPI( query = sampleID, - from = from, - to = c("ac"), - format = "txt", + from = from, + to=c("id", "ac"), ... ) }, - BPPARAM = BiocParallel::MulticoreParam(workers = threads) + BPPARAM = BPPARAM ) # if results is a list of URLS return @@ -151,200 +138,49 @@ getCellosaurusAccesions <- function(samples, from = "idsy", ..., threads=1){ results } -#' Parses the response from the Cellosaurus API. + +#' Maps Cellosaurus accessions to specified fields #' -#' This function takes the response from the Cellosaurus API and parses it into a structured format. -#' It removes unnecessary header lines, splits the response into individual records, and extracts the relevant fields. -#' The resulting data is returned as a data.table object. +#' This function takes a vector of Cellosaurus accessions and a vector of fields, +#' and retrieves the corresponding information from the Cellosaurus API. It uses +#' multiple cores for parallel processing, with the number of threads specified +#' by the 'threads' parameter. #' -#' @param response The response from the Cellosaurus API. -#' @return A data.table object containing the parsed data. -.parseCellosaurusTXTResponse <- function(response){ - response <- strsplit(response, split = "\n")[[1]][-(1:15)] - response <- split(response, cumsum(response == "//")) - response <- lapply(response, function(x) x[x != "//"]) - response <- unname(response[lengths(response) > 0]) - response <- lapply(response, function(i){ - unlist(lapply(i, .parseCellosarusField))}) - - response - data.table::rbindlist(lapply(response, function(i) { - temp <- t(i) +#' @param accessions A vector of Cellosaurus accessions. +#' @param fields A vector of fields to retrieve for each accession. +#' @param threads The number of threads to use for parallel processing. +#' +#' @return A data.table containing the retrieved information for each accession. +#' +#' @export +mapCellosaursAccessionsToFields <- function(accessions, fields, threads=1){ + BPPARAM <- BiocParallel::MulticoreParam( + workers = threads, + progressbar = TRUE) - temp <- data.table::as.data.table(temp, keep.rownames = TRUE) - # if there are multiple columns with the same name, - # collapse them into a single column separated by "; " - }), - fill=TRUE) -} -.parseCellosarusField <- function(field) { - res <- strsplit(field, split = " ")[[1]] - setNames(list(res[2]), res[1]) + results <- BiocParallel::bplapply(accessions, function(accession){ + searchCellosaurusAPI( + query = accession, + from = "ac", + to = fields + ) + }, + BPPARAM = BPPARAM + ) + results <- data.table::rbindlist(results, fill = TRUE) + results } +# Write testing code here, this is only executed if the file is run as a script +# It is equivalent to if __name__ == "__main__" in Python +# What it actually does is count the number of stack frames +if (sys.nframe() == 0) { + samples <- c("HeLa", "22rv1") -# #' Create a list of query URLS for Cellosaurus API -# #' -# #' @description -# #' This function creates a queryURL for the cellosaurus API using a list of cell line names -# #' -# #' @details -# #' Function to create a URL query for Cellosaurus to search for a cell-line using its name -# #' An example call: computedURLs <- .createQueryURLs(api = "https://api.cellosaurus.org/", -# #' cl_names = c("22rv1", "Hela"), fields = c("id", "ac")) -# #' @return A list of URLS -# #' @param api is the link to the API to build the URL. i.e "https://api.cellosaurus.org/" -# #' @param cl_names is a list of the cell line names -# #' @param format is the type of format to return from the API. Can be "txt" or "json" -# #' @param num_results is the number of of items to return, DEFAULT=1 -# #' @param GETfxn is the function to use on the cellosaurus website. Currently only supports "search/cell-line?" -# #' @param fields is a list of desired fields to include in the response -# #' -# #' @md -# #' @export -# .createQueryURLs <- -# function(api = "https://api.cellosaurus.org/", -# cl_names, -# format = "txt", -# num_results = 1, -# GETfxn = c("search/cell-line?", "cell-line/"), -# fields, -# q = "idsy:") { - -# if (GETfxn == "search/cell-line?") { -# # create urls -# computedURLs <- paste0( -# api, -# GETfxn, -# "q=", q, -# gsub(" ", "%20",cl_names), -# "&rows=", num_results, -# "&format=", format, -# "&fields=", paste(fields, collapse=",") -# ) -# return(computedURLs) -# } else if (GETfxn == "cell-line/") { -# computedURLs <- paste0( -# api, -# GETfxn, -# gsub(" ", "%20",cl_names), -# "?", -# "format=",format, -# "&fields=", paste(fields, collapse=",") -# ) -# return(computedURLs) -# } else { -# stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'") -# } - -# return(computedURLs) -# } - -# #' Query Cellosaurus -# #' -# #' @description -# #' This function takes a list of cell line names and interested fields and gets responses from the Cellosaurus API -# #' -# #' @details -# #' Function to get responses from Cellosaurus API -# #' -# #' @return A list of responses -# #' @param cl_names is a list of the cell line names -# #' @param fields is a list of desired fields to obtain for each cell line in the API query, i.e if only trying to get synonynms and primary accesssion then fields=c("sy", "ac"). see https://api.cellosaurus.org/static/fields_help.html for all fields. -# #' -# #' @md -# #' @export -# #' -# getCellosaurusAPI <- -# function( -# cl_names, # List of cell line names -# fields = c( -# "id", # Recommended name. Most frequently the name of the cell line as provided in the original publication. -# "ac", # Primary accession. It is the unique identifier of the cell line. It is normally stable across Cellosaurus versions ... -# "sy", # List of synonyms. -# "misspelling", # Identified misspelling(s) of the cell line name -# "din", # Disease(s) suffered by the individual from which the cell line originated with its NCI Thesaurus or ORDO identifier. -# "ca", # Category to which a cell line belongs, one of 14 defined terms. Example: cancer cell line, hybridoma, transformed cell line. -# "sx", # Sex -# "ag", # Age at sampling time of the individual from which the cell line was established. -# "sampling-site", # Body part, organ, cell-type the cell line is derived from -# "metastatic-site" # Body part, organ the cell line is derived from in the case of a cell line originating from a cancer metastasis. -# ), -# GETfxn = c("search/cell-line?", "cell-line/"), # Function to use on the cellosaurus website -# querydomain = "ac:" -# ){ -# cellosaurus_api_url <- "https://api.cellosaurus.org/" - -# computedURLs <- .createQueryURLs(api = cellosaurus_api_url, GETfxn = GETfxn, cl_names = cl_names, fields = fields, q = querydomain) - -# responseList <- BiocParallel::bplapply(computedURLs, function(x) GET(x)) -# names(responseList) <- cl_names -# return(responseList) -# } - -# #' Clean cellosaurus responses -# #' -# #' @description -# #' This function takes a list of Cellosaurus Responses and cleans them for use -# #' -# #' @details This function takes a list of Cellosaurus Responses and cleans them for use -# #' @return A list of responses -# #' @param responseList is a list of responses -# #' -# #' @md -# #' @export -# #' -# cleanCellosaurusResponse <- -# function( -# responseList, -# GETfxn = c("search/cell-line?", "cell-line/") -# ){ -# # Get content of each response, then separate content on newline character -# responseContent <- lapply(lapply(responseList, httr::content), -# function(x) strsplit(x=x, split="\n")) - -# if (GETfxn == "search/cell-line?") { -# #Remove first 15 rows of content -# responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)]) -# # Split on first " " appearance -# responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x, split = " .")) -# # rbind responses (do.call returns as one large matrix instead of dataframe) -# df_ <- lapply(responseContent_sub_split, function(x) do.call(rbind, x)) - -# # convert each response from matrix to data.table -# df_2 <- lapply(df_, function(x) data.table(x)) - -# # rbinds all responses (creates new column so all of cell line x will have its name in col cellLine) -# df_3<- rbindlist(df_2, idcol = "cellLine") -# df_3 <- df_3[V1!="//"] - -# # Collapse all rows with the same cellLine & V1 (most often rows for cc) and separate by "; " -# df_4 <- df_3[, list(data = paste0(unique(na.omit(V2)), collapse ="; ")), by = c("cellLine", "V1")] - -# # transpose -# df_5 <- data.table::dcast(df_4, cellLine ~ ...) - -# return(df_5) -# } else if (GETfxn == "cell-line/") { -# #Remove first 15 rows of content -# # responseContent_sub <- lapply(responseContent, function(x) x[[1]][-(1:15)]) -# responseContent_sub <- responseContent -# responseContent_sub_split <- lapply(responseContent_sub, function(x) strsplit(x[[1]], split = " .")) - -# result <- rbindlist(lapply(responseContent_sub_split, function(x) { -# # remove the entire column if any of the elements has "//" in it -# if (any(grepl("//", x))) { -# x <- x[, -1] -# } - -# cvcl_dt <- as.data.table(x) -# names(cvcl_dt) <- as.character(cvcl_dt[1]) -# cvcl_dt[2] -# })) -# return(result) + # getCellosaurusAccesions(samples, threads = 2) + result <- searchCellosaurusAPI("HeLa", returnURL = F) + # print(result) + print(result) +} -# } else { -# stop("GETfxn must be either 'search/cell-line?' or 'cell-line/'") -# } -# } \ No newline at end of file diff --git a/man/dot-constructCellosaurusURL.Rd b/man/dot-constructCellosaurusURL.Rd new file mode 100644 index 0000000..b25e3d1 --- /dev/null +++ b/man/dot-constructCellosaurusURL.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getCellosaurusAPI.R +\name{.constructCellosaurusURL} +\alias{.constructCellosaurusURL} +\title{Construct Cellosaurus API URL} +\usage{ +.constructCellosaurusURL(query, from, extResource, to, numResults) +} +\description{ +Construct Cellosaurus API URL +} +\keyword{internal} diff --git a/man/dot-parseCellosaurusTXTResponse.Rd b/man/dot-parseCellosaurusTXTResponse.Rd deleted file mode 100644 index f6e4e2d..0000000 --- a/man/dot-parseCellosaurusTXTResponse.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getCellosaurusAPI.R -\name{.parseCellosaurusTXTResponse} -\alias{.parseCellosaurusTXTResponse} -\title{Parses the response from the Cellosaurus API.} -\usage{ -.parseCellosaurusTXTResponse(response) -} -\arguments{ -\item{response}{The response from the Cellosaurus API.} -} -\value{ -A data.table object containing the parsed data. -} -\description{ -This function takes the response from the Cellosaurus API and parses it into a structured format. -It removes unnecessary header lines, splits the response into individual records, and extracts the relevant fields. -The resulting data is returned as a data.table object. -} diff --git a/man/mapCellosaursAccessionsToFields.Rd b/man/mapCellosaursAccessionsToFields.Rd new file mode 100644 index 0000000..b649006 --- /dev/null +++ b/man/mapCellosaursAccessionsToFields.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getCellosaurusAPI.R +\name{mapCellosaursAccessionsToFields} +\alias{mapCellosaursAccessionsToFields} +\title{Maps Cellosaurus accessions to specified fields} +\usage{ +mapCellosaursAccessionsToFields(accessions, fields, threads = 1) +} +\arguments{ +\item{accessions}{A vector of Cellosaurus accessions.} + +\item{fields}{A vector of fields to retrieve for each accession.} + +\item{threads}{The number of threads to use for parallel processing.} +} +\value{ +A data.table containing the retrieved information for each accession. +} +\description{ +This function takes a vector of Cellosaurus accessions and a vector of fields, +and retrieves the corresponding information from the Cellosaurus API. It uses +multiple cores for parallel processing, with the number of threads specified +by the 'threads' parameter. +} diff --git a/man/searchCellosaurusAPI.Rd b/man/searchCellosaurusAPI.Rd index d6b2391..c475849 100644 --- a/man/searchCellosaurusAPI.Rd +++ b/man/searchCellosaurusAPI.Rd @@ -2,45 +2,29 @@ % Please edit documentation in R/getCellosaurusAPI.R \name{searchCellosaurusAPI} \alias{searchCellosaurusAPI} -\title{id, sy, idsy, ac, acas, dr, ref, rx, ra, rt, rl, -ww, genome-ancestry, hla, registration, sequence-variation, -anecdotal, biotechnology, breed, caution, cell-type, -characteristics, donor-info, -derived-from-site, discontinued, doubling-time, from, group, -karyotype, knockout, msi, miscellaneous, misspelling, mab-isotype, -mab-target, omics, part-of, population, problematic, resistance, -senescence, transfected, transformant, virology, -cc, str, di, din, dio, ox, sx, ag, oi, hi, ch, ca, dt, dtc, dtu, dtv -Search Cellosaurus API} +\title{Search Cellosaurus API} \usage{ searchCellosaurusAPI( query, from = "id", extResource = NULL, - format = "tsv", to = c("id", "ac", "ca", "sx", "ag", "di", "derived-from-site", "misspelling"), numResults = 1, returnURL = FALSE ) } \arguments{ -\item{query}{The query string to search for.} +\item{query}{The query string to search for in the Cellosaurus API.} \item{from}{The field to search in. Default is "id".} -\item{extResource}{The external resource to search in, only applicable when from is "dr".} +\item{extResource}{The external resource to search in. Default is NULL.} -\item{format}{The format of the response. Only "txt" format is currently supported.} - -\item{to}{The fields to include in the response. The default is : -"id", "ac", "sy", "ca", "sx", "ag", "din", "derived-from-site", "misspelling")} +\item{to}{The fields to return in the response. Default is "id".} \item{numResults}{The number of results to return. Default is 1.} -\item{returnURL}{Logical indicating whether to return the constructed URL instead of making the API request. Default is FALSE.} -} -\value{ -A data frame containing the search results. +\item{returnURL}{Logical indicating whether to return the constructed API URL. Default is FALSE.} } \description{ This function searches the Cellosaurus API for cell line information based on the provided query. diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..5452c81 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(AnnotationGx) +library(testthat) + +test_check("AnnotationGx") \ No newline at end of file diff --git a/tests/testthat/test_getCellosaurusAPI.R b/tests/testthat/test_getCellosaurusAPI.R new file mode 100644 index 0000000..53f8a24 --- /dev/null +++ b/tests/testthat/test_getCellosaurusAPI.R @@ -0,0 +1,34 @@ +library(testthat) + + +test_that("searchCellosaurusAPI returns expected results", { + # Test case 1: Search for cell line "MCF-7" + result1 <- searchCellosaurusAPI("MCF-7") + expect_equal(nrow(result1), 1) + expect_true(grepl("MCF-7", result1$Name)) + + # Test case 2: Search for cell line "HeLa" + result2 <- searchCellosaurusAPI("HeLa") + expect_equal(nrow(result2), 1) + # expect "HeLa" in the name + expect_true(grepl("HeLa", result2$Name)) +}) + +test_that("getCellosaurusAccesions returns expected results", { + # Test case 1: Retrieve Cellosaurus accessions for sample IDs "HeLa" and "22rv1" + samples <- c("HeLa", "MCF-7") + result <- getCellosaurusAccesions(samples) + expect_equal(nrow(result), 2) + expect_true(grepl("HeLa", result$Name[1])) + expect_true(grepl("MCF-7", result$Name[2])) +}) + +test_that("mapCellosaursAccessionsToFields returns expected results", { + # Test case 1: Map Cellosaurus accessions to fields "id" and "ac" + accessions <- c("CVCL_0031", "CVCL_0030") + fields <- c("id", "ac") + result <- mapCellosaursAccessionsToFields(accessions, fields) + expect_equal(nrow(result), 2) + expect_equal(result$Accession, c("CVCL_0031", "CVCL_0030")) +}) +