diff --git a/NAMESPACE b/NAMESPACE index 28efb63..343da6d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(getPubchemCompound) export(mapCell2Accession) export(standardize_names) import(BiocParallel) +import(data.table) importFrom(checkmate,assert) importFrom(checkmate,assert_atomic) importFrom(checkmate,assert_choice) diff --git a/R/cellosaurus.R b/R/cellosaurus.R index 8447a5d..195780b 100644 --- a/R/cellosaurus.R +++ b/R/cellosaurus.R @@ -19,7 +19,6 @@ #' to "BT474 A3" whereas "BT-474" exists in the database as "CVCL_0179". If prioritizeParent is TRUE, #' the function will prioritize "CVCL_0179" over "CVCL_YX79" since "BT-474" is the parent cell line of #' "BT474 A3". -#' @param orderby The field to order the results by. Default is "ac" to order by accession number. #' @param ... Additional arguments to pass to the request. #' #' @return Depending on parameters, either a: @@ -32,7 +31,7 @@ #' @export mapCell2Accession <- function( ids, numResults = 1000, from = "id", to = c("id", "ac"), - prioritizeParent = FALSE, orderby = "ac", + prioritizeParent = FALSE, query_only = FALSE, raw = FALSE, BPPARAM = BiocParallel::SerialParam(), ... ) { @@ -85,17 +84,55 @@ mapCell2Accession <- function( if(!prioritizeParent) return(responses_dt) if(all(is.na(responses_dt$hi))) return(responses_dt) + + if((prioritizeParent) && from != "id") .err("Prioritize parent is only available when querying from 'id'") - return(.prioritize_parent(responses_dt)) + return(.prioritize_parent(responses_dt, numResults)) } - -.prioritize_parent <- function(responses_dt) { - responses_dt[, c("parentAC", "parentID") := tstrsplit(hi, "!", fixed = TRUE)] +#' @import data.table +#' +#' @keywords internal +#' @noRd +.prioritize_parent <- function(responses_dt, numResults ) { + responses_dt[, c("parentAC", "parentID") := data.table::tstrsplit(hi, " ! ", fixed = TRUE)] responses_dt <- responses_dt[, -"hi"] if(all(is.na(responses_dt$parentAC))) return(responses_dt[, -c("parentAC", "parentID")]) parentACs <- na.omit(unique(responses_dt$parentAC)) + columns <- names(responses_dt) + + responses_dt <- + if(all(parentACs %in% responses_dt$ac)) { + # if so, move all the rows that are parents to the top + parentRows <- responses_dt$ac %in% parentACs + + parentDT <- responses_dt[parentRows, ] + childDT <- responses_dt[!parentRows, ] + rbind(parentDT, childDT) + + } else{ + # add the parentAC and parentID pairs to the top of the table + + new_rows <- unique( + responses_dt[parentAC %in% parentACs[!parentACs %in% responses_dt$ac], .(ac = parentAC, id = parentID, query = query, `query:id` = `query:id`)] + ) + parent_rows <- responses_dt[parentAC %in% parentACs,] + child_rows <- responses_dt[!parentAC %in% parentACs,] + new_dt <- data.table::rbindlist(list(parent_rows, new_rows, child_rows), use.names=TRUE, fill=TRUE) + new_dt[] + } + # groupby query and query:id + # for each group, sort by the highest number of parentAC counts + data.table::setorderv(responses_dt, c("query","ac")) + responses_dt[, c("parentAC", "parentID") := NULL] + + # only return numResults rows for each group by query + responses_dt <- responses_dt[, .SD[1:min(.N, numResults)], by = .(query)] + + # reorder the columns + responses_dt <- responses_dt[, c("id", "ac", "query", "query:id")] + return(na.omit(responses_dt[])) } diff --git a/R/cellosaurus_helpers.R b/R/cellosaurus_helpers.R index cbde50b..31efdce 100644 --- a/R/cellosaurus_helpers.R +++ b/R/cellosaurus_helpers.R @@ -83,11 +83,33 @@ # CA Category Once # DT Date (entry history) Once # // Terminator Once; ends an entry + +#' Get the list of fields in the Cellosaurus schema +#' +#' This function retrieves the list of fields available in the Cellosaurus schema. +#' It internally calls the `.get_cellosaurus_schema()` function to fetch the schema +#' and extracts the list of fields from it. +#' +#' @return A character vector containing the list of fields in the Cellosaurus schema. +#' +#' @keywords internal +#' @noRd .cellosaurus_fields <- function(){ schema <- .get_cellosaurus_schema() schema$components$schemas$Fields$enum } +#' Get the Cellosaurus schema +#' +#' This function retrieves the Cellosaurus schema from the Cellosaurus API. +#' It internally calls the `.buildURL()`, `.build_request()`, `.perform_request()`, +#' and `.parse_resp_json()` functions to construct the API URL, send the request, +#' and parse the response. +#' +#' @return A list representing the Cellosaurus schema. +#' +#' @keywords internal +#' @noRd .get_cellosaurus_schema <- function(){ url <- .buildURL("https://api.cellosaurus.org/openapi.json") request <- .build_request(url) @@ -98,7 +120,6 @@ - # --------- -------------------------------------- ------------------------------------------------- # Line code Content Description # --------- -------------------------------------- ------------------------------------------------- @@ -129,6 +150,12 @@ # KARY Karyotype Information relevant to the chromosomes of a cell line (often to describe chromosomal abnormalities). # KO Knockout Gene(s) knocked-out in the cell line and method to obtain the KO. # // Terminator Once; ends an entry + + +#' Internal function to return the list of fields available in Cellosaurus +#' +#' @keywords internal +#' @noRd .common_cellosaurus_fields <- function(){ c("ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI", "HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP", @@ -157,7 +184,11 @@ # Cell_Model_Passport, DepMap, ATCC, Cosmic, Cosmic-CLP - +#' Internal function to return the list of external resources available in Cellosaurus +#' @return A character vector of external resources available in Cellosaurus +#' +#' @keywords internal +#' @noRd .cellosaurus_extResources <- function(){ c("4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress", "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources", diff --git a/man/mapCell2Accession.Rd b/man/mapCell2Accession.Rd index b5fc1c1..cfea5c4 100644 --- a/man/mapCell2Accession.Rd +++ b/man/mapCell2Accession.Rd @@ -10,7 +10,6 @@ mapCell2Accession( from = "id", to = c("id", "ac"), prioritizeParent = FALSE, - orderby = "ac", query_only = FALSE, raw = FALSE, BPPARAM = BiocParallel::SerialParam(), @@ -35,8 +34,6 @@ to "BT474 A3" whereas "BT-474" exists in the database as "CVCL_0179". If priorit the function will prioritize "CVCL_0179" over "CVCL_YX79" since "BT-474" is the parent cell line of "BT474 A3".} -\item{orderby}{The field to order the results by. Default is "ac" to order by accession number.} - \item{query_only}{If TRUE, returns the query URL instead of the results. Default is FALSE.} \item{raw}{If TRUE, returns the raw response instead of a data table. Default is FALSE.} diff --git a/tests/testthat/test_cellosaurus.R b/tests/testthat/test_cellosaurus.R index 2ab2c1c..fdfceb5 100644 --- a/tests/testthat/test_cellosaurus.R +++ b/tests/testthat/test_cellosaurus.R @@ -38,8 +38,13 @@ test_that("mapCell2Accession prioritizePatient works as expected",{ result1 <- mapCell2Accession(cell_line,from="id", BPPARAM = BiocParallel::SerialParam(), numResults=1, prioritizeParent = TRUE) - expect_data_table(result1, nrows=2, ncols = 4) + expect_data_table(result1, nrows=1, ncols = 4) expect_named(result1, c("id", "ac", "query", "query:id")) + + + # cant prioritizeParent if from != "id".. yet + expect_error(mapCell2Accession("BT474", numResults=1, from="idsy", prioritizeParent=T)) + }) diff --git a/tests/testthat/test_cellosaurus_helpers.R b/tests/testthat/test_cellosaurus_helpers.R index 810c371..a469e71 100644 --- a/tests/testthat/test_cellosaurus_helpers.R +++ b/tests/testthat/test_cellosaurus_helpers.R @@ -46,3 +46,40 @@ test_that(".build_cellosaurus_request is acting as expected",{ expect_equal(nrow(response), 2) }) +test_that(".common_cellosaurus_fields returns the expected fields", { + fields <- AnnotationGx:::.common_cellosaurus_fields() + expect_character(fields) + + expected_fields <- c("ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI", + "HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP", + "KARY", "KO") + + expect_equal(fields, expected_fields) +}) +test_that(".cellosaurus_extResources returns the expected external resources", { + resources <- AnnotationGx:::.cellosaurus_extResources() + expect_character(resources) + + expected_resources <- c("4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress", + "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources", + "BioGRID_ORCS_Cell_line", "BTO", "BioSample", "BioSamples", + "cancercelllines", "CancerTools", "CBA", "CCLV", "CCRID", + "CCTCC", "Cell_Biolabs", "Cell_Model_Passport", "CGH-DB", + "ChEMBL-Cells", "ChEMBL-Targets", "CLDB", "CLO", "CLS", + "ColonAtlas", "Coriell", "Cosmic", "Cosmic-CLP", "dbGAP", + "dbMHC", "DepMap", "DGRC", "DiscoverX", "DSHB", "DSMZ", + "DSMZCellDive", "EBiSC", "ECACC", "EFO", "EGA", "ENCODE", + "ESTDAB", "FCDI", "FCS-free", "FlyBase_Cell_line", "GDSC", + "GeneCopoeia", "GEO", "HipSci", "HIVReagentProgram", "Horizon_Discovery", + "hPSCreg", "IARC_TP53", "IBRC", "ICLC", "ICLDB", "IGRhCellID", + "IGSR", "IHW", "Imanis", "Innoprot", "IPD-IMGT/HLA", "ISCR", + "IZSLER", "JCRB", "KCB", "KCLB", "Kerafast", "KYinno", "LiGeA", + "LIMORE", "LINCS_HMS", "LINCS_LDP", "Lonza", "MCCL", "MeSH", + "MetaboLights", "Millipore", "MMRRC", "NCBI_Iran", "NCI-DTP", "NHCDR", + "NIHhESC", "NISES", "NRFC", "PerkinElmer", "PharmacoDB", "PRIDE", + "Progenetix", "PubChem_Cell_line", "RCB", "Rockland", "RSCB", "SKIP", + "SKY/M-FISH/CGH", "SLKBase", "TKG", "TNGB", "TOKU-E", "Ubigene", + "WiCell", "Wikidata", "Ximbio") + + expect_equal(resources, expected_resources) +}) \ No newline at end of file diff --git a/vignettes/Cellosaurus.Rmd b/vignettes/Cellosaurus.Rmd index 5e03443..6d7884b 100644 --- a/vignettes/Cellosaurus.Rmd +++ b/vignettes/Cellosaurus.Rmd @@ -73,7 +73,7 @@ This is the case if we query for "A549" in the same way. # our query: mapCell2Accession("A549", from = "id", numResults=1) -# the standard name: +# if we use the actual standard name: mapCell2Accession("A-549", from = "id", numResults=1) # trying to get the standard name with more results @@ -81,7 +81,7 @@ mapCell2Accession("A549", from = "id", numResults=10) ``` The main identifier for the A549 cell line is "A-549" with accesion id of "CVCL_0023" -which does not appear in even the first 10 results (note: it would be the 40th result +which does not appear in even the first 10 results (it would be the 40th result if we adjust the `numResults` parameter) and while we could manually adjust every query to match the standard name, this is not feasible for hundreds of cell lines. @@ -95,26 +95,26 @@ results based on the accession id of the parent cell line. mapCell2Accession("A549", from = "id", numResults=1, prioritizeParent=TRUE) ``` +Another example using the BT474 cell line. ``` {r map BT474 each} -mapCell2Accession("BT474", from = "id", numResults=1, prioritizeParent=TRUE) +mapCell2Accession("BT474", numResults=1, prioritizeParent=FALSE) + + +mapCell2Accession("BT474", numResults=1, prioritizeParent=TRUE) ``` -### Setup +### Example pipeline to annotate a dataset We will be working with some data from the GDSC and Cell Model Passports datasets for this vignette. The GDSC dataset contains information about the cell lines in the Genomics of Drug Sensitivity in Cancer (GDSC) project. The Cell Model Passports dataset contains information about all the models in the the database. - - - - The GDSC sampleMetadata contains two columns, `GDSC.Sample_Name` and `GDSC.COSMIC_ID`. ``` {r setup data} data(gdsc_sampleMetadata) @@ -122,7 +122,7 @@ data(cell_model_passports_models) ``` ```{r view data} -head(gdsc_sampleMetadata) +# head(gdsc_sampleMetadata) ``` @@ -135,13 +135,13 @@ head(gdsc_sampleMetadata) ```{r view data2} -head(cell_model_passports_models) +# head(cell_model_passports_models) ``` ```{r map one name to fields} -fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site") +# fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site") # By passing the fields argument, the function will return the information for the fields specified # AnnotationGx::mapCell2Accession(cell_model_passports_models[["CMP.model_name"]], to=fields) |> head()