Add data.table package import and update tests***

***Update mapCell2Accession function documentation*** ***Add common_cellosaurus_fields and cellosaurus_extResources tests*** ***Update Cellosaurus vignette with examples*** ***Refactor code in vignette to improve readability
bhklab · Mar 2, 2024 · c2da4a4 · c2da4a4
1 parent af45820
commit c2da4a4
Show file tree

Hide file tree

Showing 7 changed files with 131 additions and 23 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(getPubchemCompound)
 export(mapCell2Accession)
 export(standardize_names)
 import(BiocParallel)
+import(data.table)
 importFrom(checkmate,assert)
 importFrom(checkmate,assert_atomic)
 importFrom(checkmate,assert_choice)

diff --git a/R/cellosaurus.R b/R/cellosaurus.R
@@ -19,7 +19,6 @@
 #'                        to "BT474 A3" whereas "BT-474" exists in the database as "CVCL_0179". If prioritizeParent is TRUE,
 #'                        the function will prioritize "CVCL_0179" over "CVCL_YX79" since "BT-474" is the parent cell line of 
 #'                        "BT474 A3".
-#' @param orderby The field to order the results by. Default is "ac" to order by accession number. 
 #' @param ... Additional arguments to pass to the request.
 #'
 #' @return Depending on parameters, either a:
@@ -32,7 +31,7 @@
 #' @export
 mapCell2Accession <- function(
     ids, numResults = 1000, from = "id", to = c("id", "ac"),
-    prioritizeParent = FALSE, orderby = "ac",
+    prioritizeParent = FALSE, 
     query_only = FALSE, raw = FALSE, BPPARAM = BiocParallel::SerialParam(), ...
 ) {
 
@@ -85,17 +84,55 @@ mapCell2Accession <- function(
 
     if(!prioritizeParent) return(responses_dt)
     if(all(is.na(responses_dt$hi))) return(responses_dt)
+
+    if((prioritizeParent) && from != "id") .err("Prioritize parent is only available when querying from 'id'")
 
-    return(.prioritize_parent(responses_dt))
+    return(.prioritize_parent(responses_dt, numResults))
 }
 
-
-.prioritize_parent <- function(responses_dt) {
-    responses_dt[, c("parentAC", "parentID") := tstrsplit(hi, "!", fixed = TRUE)]
+#' @import data.table
+#' 
+#' @keywords internal
+#' @noRd
+.prioritize_parent <- function(responses_dt, numResults ) {
+    responses_dt[, c("parentAC", "parentID") := data.table::tstrsplit(hi, " ! ", fixed = TRUE)]
     responses_dt <- responses_dt[, -"hi"]
 
     if(all(is.na(responses_dt$parentAC))) return(responses_dt[, -c("parentAC", "parentID")])
 
     parentACs <- na.omit(unique(responses_dt$parentAC))
+    columns <- names(responses_dt)
+
+    responses_dt <- 
+        if(all(parentACs %in% responses_dt$ac)) {
+            # if so, move all the rows that are parents to the top
+            parentRows <- responses_dt$ac %in% parentACs
+
+            parentDT <- responses_dt[parentRows, ]
+            childDT <- responses_dt[!parentRows, ]
+            rbind(parentDT, childDT)
+
+        } else{
+            # add the parentAC and parentID pairs to the top of the table
+
+            new_rows <- unique(
+                responses_dt[parentAC %in% parentACs[!parentACs %in% responses_dt$ac], .(ac = parentAC, id = parentID, query = query, `query:id` = `query:id`)]
+            )
+            parent_rows <- responses_dt[parentAC %in% parentACs,]
+            child_rows <- responses_dt[!parentAC %in% parentACs,]
+            new_dt <- data.table::rbindlist(list(parent_rows, new_rows, child_rows), use.names=TRUE, fill=TRUE)
+            new_dt[]
+        }
+    # groupby query and query:id 
+    # for each group, sort by the highest number of parentAC counts
+    data.table::setorderv(responses_dt, c("query","ac"))
+    responses_dt[, c("parentAC", "parentID") := NULL]
+
+    # only return numResults rows for each group by query
+    responses_dt <- responses_dt[, .SD[1:min(.N, numResults)], by = .(query)]
+
+    # reorder the columns
+    responses_dt <- responses_dt[, c("id", "ac", "query", "query:id")]
 
+    return(na.omit(responses_dt[]))
 }
diff --git a/R/cellosaurus_helpers.R b/R/cellosaurus_helpers.R
@@ -83,11 +83,33 @@
 #  CA         Category                        Once
 #  DT         Date (entry history)            Once
 #  //         Terminator                      Once; ends an entry
+
+#' Get the list of fields in the Cellosaurus schema
+#'
+#' This function retrieves the list of fields available in the Cellosaurus schema.
+#' It internally calls the `.get_cellosaurus_schema()` function to fetch the schema
+#' and extracts the list of fields from it.
+#'
+#' @return A character vector containing the list of fields in the Cellosaurus schema.
+#'
+#' @keywords internal
+#' @noRd
 .cellosaurus_fields <- function(){
     schema <- .get_cellosaurus_schema()
     schema$components$schemas$Fields$enum
 }
 
+#' Get the Cellosaurus schema
+#'
+#' This function retrieves the Cellosaurus schema from the Cellosaurus API.
+#' It internally calls the `.buildURL()`, `.build_request()`, `.perform_request()`,
+#' and `.parse_resp_json()` functions to construct the API URL, send the request,
+#' and parse the response.
+#'
+#' @return A list representing the Cellosaurus schema.
+#'
+#' @keywords internal
+#' @noRd
 .get_cellosaurus_schema <- function(){
     url <- .buildURL("https://api.cellosaurus.org/openapi.json")
     request <- .build_request(url)
@@ -98,7 +120,6 @@
 
 
 
-
 #  ---------  --------------------------------------  -------------------------------------------------
 #  Line code  Content                                 Description
 #  ---------  --------------------------------------  -------------------------------------------------
@@ -129,6 +150,12 @@
 #  KARY       Karyotype                               Information relevant to the chromosomes of a cell line (often to describe chromosomal abnormalities).
 #  KO         Knockout                                Gene(s) knocked-out in the cell line and method to obtain the KO.
 #  //         Terminator                             Once; ends an entry
+
+
+#' Internal function to return the list of fields available in Cellosaurus
+#' 
+#' @keywords internal
+#' @noRd
 .common_cellosaurus_fields <- function(){
     c("ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI",
     "HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP",
@@ -157,7 +184,11 @@
 
 # Cell_Model_Passport, DepMap, ATCC, Cosmic, Cosmic-CLP
 
-
+#' Internal function to return the list of external resources available in Cellosaurus
+#' @return A character vector of external resources available in Cellosaurus
+#' 
+#' @keywords internal
+#' @noRd
 .cellosaurus_extResources <- function(){
     c("4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress",
     "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources",

diff --git a/man/mapCell2Accession.Rd b/man/mapCell2Accession.Rd
diff --git a/tests/testthat/test_cellosaurus.R b/tests/testthat/test_cellosaurus.R
@@ -38,8 +38,13 @@ test_that("mapCell2Accession prioritizePatient works as expected",{
 
     result1 <- mapCell2Accession(cell_line,from="id", BPPARAM = BiocParallel::SerialParam(), numResults=1, prioritizeParent = TRUE)
 
-    expect_data_table(result1, nrows=2, ncols = 4)
+    expect_data_table(result1, nrows=1, ncols = 4)
     expect_named(result1,  c("id", "ac", "query", "query:id"))
+
+
+    # cant prioritizeParent if from != "id".. yet
+    expect_error(mapCell2Accession("BT474", numResults=1, from="idsy",  prioritizeParent=T))
+
 })
 
 

diff --git a/tests/testthat/test_cellosaurus_helpers.R b/tests/testthat/test_cellosaurus_helpers.R
@@ -46,3 +46,40 @@ test_that(".build_cellosaurus_request is acting as expected",{
     expect_equal(nrow(response), 2)
 
 })
+test_that(".common_cellosaurus_fields returns the expected fields", {
+  fields <- AnnotationGx:::.common_cellosaurus_fields()
+  expect_character(fields)
+
+  expected_fields <- c("ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI",
+                      "HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP",
+                      "KARY", "KO")
+
+  expect_equal(fields, expected_fields)
+})
+test_that(".cellosaurus_extResources returns the expected external resources", {
+  resources <- AnnotationGx:::.cellosaurus_extResources()
+  expect_character(resources)
+
+  expected_resources <- c("4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress",
+    "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources",
+    "BioGRID_ORCS_Cell_line", "BTO", "BioSample", "BioSamples",
+    "cancercelllines", "CancerTools", "CBA", "CCLV", "CCRID",
+    "CCTCC", "Cell_Biolabs", "Cell_Model_Passport", "CGH-DB",
+    "ChEMBL-Cells", "ChEMBL-Targets", "CLDB", "CLO", "CLS",
+    "ColonAtlas", "Coriell", "Cosmic", "Cosmic-CLP", "dbGAP",
+    "dbMHC", "DepMap", "DGRC", "DiscoverX", "DSHB", "DSMZ",
+    "DSMZCellDive", "EBiSC", "ECACC", "EFO", "EGA", "ENCODE",
+    "ESTDAB", "FCDI", "FCS-free", "FlyBase_Cell_line", "GDSC",
+    "GeneCopoeia", "GEO", "HipSci", "HIVReagentProgram", "Horizon_Discovery",
+    "hPSCreg", "IARC_TP53", "IBRC", "ICLC", "ICLDB", "IGRhCellID",
+    "IGSR", "IHW", "Imanis", "Innoprot", "IPD-IMGT/HLA", "ISCR",
+    "IZSLER", "JCRB", "KCB", "KCLB", "Kerafast", "KYinno", "LiGeA",
+    "LIMORE", "LINCS_HMS", "LINCS_LDP", "Lonza", "MCCL", "MeSH",
+    "MetaboLights", "Millipore", "MMRRC", "NCBI_Iran", "NCI-DTP", "NHCDR",
+    "NIHhESC", "NISES", "NRFC", "PerkinElmer", "PharmacoDB", "PRIDE",
+    "Progenetix", "PubChem_Cell_line", "RCB", "Rockland", "RSCB", "SKIP",
+    "SKY/M-FISH/CGH", "SLKBase", "TKG", "TNGB", "TOKU-E", "Ubigene",
+    "WiCell", "Wikidata", "Ximbio")
+
+  expect_equal(resources, expected_resources)
+})
diff --git a/vignettes/Cellosaurus.Rmd b/vignettes/Cellosaurus.Rmd
@@ -73,15 +73,15 @@ This is the case if we query for "A549" in the same way.
 # our query:
 mapCell2Accession("A549", from = "id", numResults=1)
 
-# the standard name:
+# if we use the actual standard name:
 mapCell2Accession("A-549", from = "id", numResults=1)
 
 # trying to get the standard name with more results
 mapCell2Accession("A549", from = "id", numResults=10)
 ```
 
 The main identifier for the A549 cell line is "A-549" with accesion id of "CVCL_0023" 
-which does not appear in even the first 10 results (note: it would be the 40th result 
+which does not appear in even the first 10 results (it would be the 40th result 
 if we adjust the `numResults` parameter) and while we could manually adjust every query 
 to match the standard name, this is not feasible for hundreds of cell lines. 
 
@@ -95,34 +95,34 @@ results based on the accession id of the parent cell line.
 mapCell2Accession("A549", from = "id", numResults=1, prioritizeParent=TRUE)
 ```
 
+Another example using the BT474 cell line. 
 
 ``` {r map BT474 each}
-mapCell2Accession("BT474", from = "id", numResults=1, prioritizeParent=TRUE)
+mapCell2Accession("BT474", numResults=1, prioritizeParent=FALSE)
+
+
+mapCell2Accession("BT474", numResults=1, prioritizeParent=TRUE)
 ```
 
 
 
 
 
-### Setup
+### Example pipeline to annotate a dataset
 We will be working with some data from the GDSC and Cell Model Passports datasets 
 for this vignette. The GDSC dataset contains information about the cell lines 
 in the Genomics of Drug Sensitivity in Cancer (GDSC) project. The Cell Model
 Passports dataset contains information about all the models in the the database.
 
 
-
-
-
-
 The GDSC sampleMetadata contains two columns, `GDSC.Sample_Name` and `GDSC.COSMIC_ID`.
 ``` {r setup data}
 data(gdsc_sampleMetadata)
 data(cell_model_passports_models)
 ```
 
 ```{r view data}
-head(gdsc_sampleMetadata)
+# head(gdsc_sampleMetadata)
 
 ```
 
@@ -135,13 +135,13 @@ head(gdsc_sampleMetadata)
 
 
 ```{r view data2}
-head(cell_model_passports_models)
+# head(cell_model_passports_models)
 
 ```
 
 
 ```{r map one name to fields}
-fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site")
+# fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site")
 
 # By passing the fields argument, the function will return the information for the fields specified
 # AnnotationGx::mapCell2Accession(cell_model_passports_models[["CMP.model_name"]], to=fields) |> head()