Skip to content

Commit

Permalink
Add data.table package import and update tests***
Browse files Browse the repository at this point in the history
***Update mapCell2Accession function documentation***

***Add common_cellosaurus_fields and cellosaurus_extResources tests***

***Update Cellosaurus vignette with examples***

***Refactor code in vignette to improve readability
  • Loading branch information
jjjermiah committed Mar 2, 2024
1 parent af45820 commit c2da4a4
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 23 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(getPubchemCompound)
export(mapCell2Accession)
export(standardize_names)
import(BiocParallel)
import(data.table)
importFrom(checkmate,assert)
importFrom(checkmate,assert_atomic)
importFrom(checkmate,assert_choice)
Expand Down
49 changes: 43 additions & 6 deletions R/cellosaurus.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#' to "BT474 A3" whereas "BT-474" exists in the database as "CVCL_0179". If prioritizeParent is TRUE,
#' the function will prioritize "CVCL_0179" over "CVCL_YX79" since "BT-474" is the parent cell line of
#' "BT474 A3".
#' @param orderby The field to order the results by. Default is "ac" to order by accession number.
#' @param ... Additional arguments to pass to the request.
#'
#' @return Depending on parameters, either a:
Expand All @@ -32,7 +31,7 @@
#' @export
mapCell2Accession <- function(
ids, numResults = 1000, from = "id", to = c("id", "ac"),
prioritizeParent = FALSE, orderby = "ac",
prioritizeParent = FALSE,
query_only = FALSE, raw = FALSE, BPPARAM = BiocParallel::SerialParam(), ...
) {

Expand Down Expand Up @@ -85,17 +84,55 @@ mapCell2Accession <- function(

if(!prioritizeParent) return(responses_dt)
if(all(is.na(responses_dt$hi))) return(responses_dt)

if((prioritizeParent) && from != "id") .err("Prioritize parent is only available when querying from 'id'")

return(.prioritize_parent(responses_dt))
return(.prioritize_parent(responses_dt, numResults))
}


.prioritize_parent <- function(responses_dt) {
responses_dt[, c("parentAC", "parentID") := tstrsplit(hi, "!", fixed = TRUE)]
#' Put parent cell lines first in a Cellosaurus mapping result
#'
#' Splits the `hi` column (formatted as "<parent accession> ! <parent id>")
#' into parent accession/id, makes sure every referenced parent has a row of
#' its own, and then orders each query's rows so that parent cell lines come
#' before all other rows. Only the first `numResults` rows per query are kept.
#'
#' @param responses_dt A `data.table` of mapping results. The code reads the
#'   columns `id`, `ac`, `hi`, `query` and `query:id`. The input is not
#'   modified; the function works on a copy.
#' @param numResults Maximum number of rows to return for each `query`.
#'
#' @return If no row has a parent, `responses_dt` without the `hi` column.
#'   Otherwise a `data.table` with the columns `id`, `ac`, `query`, `query:id`
#'   (rows containing `NA` in any of these are dropped), sorted by `query`,
#'   with parent rows first and ties broken by `ac`.
#'
#' @import data.table
#'
#' @keywords internal
#' @noRd
.prioritize_parent <- function(responses_dt, numResults) {
  # `:=` modifies by reference; copy so the helper columns never leak into
  # the table owned by the caller.
  responses_dt <- data.table::copy(responses_dt)
  responses_dt[
    ,
    c("parentAC", "parentID") := data.table::tstrsplit(hi, " ! ", fixed = TRUE)
  ]
  responses_dt[, hi := NULL]

  # Nothing to prioritize when no result declares a parent.
  if (all(is.na(responses_dt$parentAC))) {
    return(responses_dt[, -c("parentAC", "parentID")])
  }

  parentACs <- unique(responses_dt$parentAC[!is.na(responses_dt$parentAC)])

  # Parents that are referenced by a child but were not returned themselves
  # get a synthetic row built from the child's parentAC/parentID pair.
  missingParents <- setdiff(parentACs, responses_dt$ac)
  if (length(missingParents) > 0L) {
    new_rows <- unique(
      responses_dt[
        parentAC %in% missingParents,
        .(ac = parentAC, id = parentID, query = query, `query:id` = `query:id`)
      ]
    )
    responses_dt <- data.table::rbindlist(
      list(responses_dt, new_rows),
      use.names = TRUE, fill = TRUE
    )
  }

  # Sort within each query with parents first, then by accession. Sorting by
  # accession alone (the previous behaviour) discarded the prioritization
  # whenever a parent's accession did not happen to sort first.
  data.table::set(
    responses_dt,
    j = "isParent", value = responses_dt$ac %in% parentACs
  )
  data.table::setorderv(
    responses_dt,
    cols = c("query", "isParent", "ac"),
    order = c(1L, -1L, 1L)
  )
  responses_dt[, c("parentAC", "parentID", "isParent") := NULL]

  # Keep at most numResults rows per query; seq_len() also copes with 0.
  responses_dt <- responses_dt[
    ,
    .SD[seq_len(min(.N, numResults))],
    by = .(query)
  ]

  # Fixed output column order.
  responses_dt <- responses_dt[, c("id", "ac", "query", "query:id")]

  return(na.omit(responses_dt[]))
}
35 changes: 33 additions & 2 deletions R/cellosaurus_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,33 @@
# CA Category Once
# DT Date (entry history) Once
# // Terminator Once; ends an entry

#' List the fields declared in the Cellosaurus schema
#'
#' Fetches the schema with `.get_cellosaurus_schema()` and returns the `enum`
#' entry stored under `components$schemas$Fields`.
#'
#' @return The `enum` element of the schema's `Fields` component, i.e. the
#'   field names the schema declares.
#'
#' @keywords internal
#' @noRd
.cellosaurus_fields <- function() {
  fields_component <- .get_cellosaurus_schema()$components$schemas$Fields
  fields_component$enum
}

#' Get the Cellosaurus schema
#'
#' This function retrieves the Cellosaurus schema from the Cellosaurus API.
#' It internally calls the `.buildURL()`, `.build_request()`, `.perform_request()`,
#' and `.parse_resp_json()` functions to construct the API URL, send the request,
#' and parse the response.
#'
#' @return A list representing the Cellosaurus schema.
#'
#' @keywords internal
#' @noRd
.get_cellosaurus_schema <- function(){
url <- .buildURL("https://api.cellosaurus.org/openapi.json")
request <- .build_request(url)
Expand All @@ -98,7 +120,6 @@




# --------- -------------------------------------- -------------------------------------------------
# Line code Content Description
# --------- -------------------------------------- -------------------------------------------------
Expand Down Expand Up @@ -129,6 +150,12 @@
# KARY Karyotype Information relevant to the chromosomes of a cell line (often to describe chromosomal abnormalities).
# KO Knockout Gene(s) knocked-out in the cell line and method to obtain the KO.
# // Terminator Once; ends an entry


#' Internal function to return the list of fields available in Cellosaurus
#'
#' @keywords internal
#' @noRd
.common_cellosaurus_fields <- function(){
c("ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI",
"HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP",
Expand Down Expand Up @@ -157,7 +184,11 @@

# Cell_Model_Passport, DepMap, ATCC, Cosmic, Cosmic-CLP


#' Internal function to return the list of external resources available in Cellosaurus
#' @return A character vector of external resources available in Cellosaurus
#'
#' @keywords internal
#' @noRd
.cellosaurus_extResources <- function(){
c("4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress",
"ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources",
Expand Down
3 changes: 0 additions & 3 deletions man/mapCell2Accession.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion tests/testthat/test_cellosaurus.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,13 @@ test_that("mapCell2Accession prioritizePatient works as expected",{

result1 <- mapCell2Accession(cell_line,from="id", BPPARAM = BiocParallel::SerialParam(), numResults=1, prioritizeParent = TRUE)

expect_data_table(result1, nrows=2, ncols = 4)
expect_data_table(result1, nrows=1, ncols = 4)
expect_named(result1, c("id", "ac", "query", "query:id"))


# cant prioritizeParent if from != "id".. yet
expect_error(mapCell2Accession("BT474", numResults=1, from="idsy", prioritizeParent=T))

})


Expand Down
37 changes: 37 additions & 0 deletions tests/testthat/test_cellosaurus_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,40 @@ test_that(".build_cellosaurus_request is acting as expected",{
expect_equal(nrow(response), 2)

})
test_that(".common_cellosaurus_fields returns the expected fields", {
  # Reference field codes, in the exact order the helper must return them.
  reference_fields <- c(
    "ID", "AC", "AS", "SY", "DR", "DI", "DIN", "DIO", "OX", "SX", "AG", "OI",
    "HI", "CH", "CA", "CEL", "DT", "DTC", "DTU", "DTV", "DER", "FROM", "GROUP",
    "KARY", "KO"
  )

  actual_fields <- AnnotationGx:::.common_cellosaurus_fields()

  expect_character(actual_fields)
  expect_equal(actual_fields, reference_fields)
})
test_that(".cellosaurus_extResources returns the expected external resources", {
  # Reference resource names, in the exact order the helper must return them.
  reference_resources <- c(
    "4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress",
    "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources",
    "BioGRID_ORCS_Cell_line", "BTO", "BioSample", "BioSamples",
    "cancercelllines", "CancerTools", "CBA", "CCLV", "CCRID",
    "CCTCC", "Cell_Biolabs", "Cell_Model_Passport", "CGH-DB",
    "ChEMBL-Cells", "ChEMBL-Targets", "CLDB", "CLO", "CLS",
    "ColonAtlas", "Coriell", "Cosmic", "Cosmic-CLP", "dbGAP",
    "dbMHC", "DepMap", "DGRC", "DiscoverX", "DSHB", "DSMZ",
    "DSMZCellDive", "EBiSC", "ECACC", "EFO", "EGA", "ENCODE",
    "ESTDAB", "FCDI", "FCS-free", "FlyBase_Cell_line", "GDSC",
    "GeneCopoeia", "GEO", "HipSci", "HIVReagentProgram", "Horizon_Discovery",
    "hPSCreg", "IARC_TP53", "IBRC", "ICLC", "ICLDB", "IGRhCellID",
    "IGSR", "IHW", "Imanis", "Innoprot", "IPD-IMGT/HLA", "ISCR",
    "IZSLER", "JCRB", "KCB", "KCLB", "Kerafast", "KYinno", "LiGeA",
    "LIMORE", "LINCS_HMS", "LINCS_LDP", "Lonza", "MCCL", "MeSH",
    "MetaboLights", "Millipore", "MMRRC", "NCBI_Iran", "NCI-DTP", "NHCDR",
    "NIHhESC", "NISES", "NRFC", "PerkinElmer", "PharmacoDB", "PRIDE",
    "Progenetix", "PubChem_Cell_line", "RCB", "Rockland", "RSCB", "SKIP",
    "SKY/M-FISH/CGH", "SLKBase", "TKG", "TNGB", "TOKU-E", "Ubigene",
    "WiCell", "Wikidata", "Ximbio"
  )

  actual_resources <- AnnotationGx:::.cellosaurus_extResources()

  expect_character(actual_resources)
  expect_equal(actual_resources, reference_resources)
})
22 changes: 11 additions & 11 deletions vignettes/Cellosaurus.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ This is the case if we query for "A549" in the same way.
# our query:
mapCell2Accession("A549", from = "id", numResults=1)
# the standard name:
# if we use the actual standard name:
mapCell2Accession("A-549", from = "id", numResults=1)
# trying to get the standard name with more results
mapCell2Accession("A549", from = "id", numResults=10)
```

The main identifier for the A549 cell line is "A-549" with accession id of "CVCL_0023"
which does not appear in even the first 10 results (note: it would be the 40th result
which does not appear in even the first 10 results (it would be the 40th result
if we adjust the `numResults` parameter) and while we could manually adjust every query
to match the standard name, this is not feasible for hundreds of cell lines.

Expand All @@ -95,34 +95,34 @@ results based on the accession id of the parent cell line.
mapCell2Accession("A549", from = "id", numResults=1, prioritizeParent=TRUE)
```

Another example using the BT474 cell line.

``` {r map BT474 each}
mapCell2Accession("BT474", from = "id", numResults=1, prioritizeParent=TRUE)
mapCell2Accession("BT474", numResults=1, prioritizeParent=FALSE)
mapCell2Accession("BT474", numResults=1, prioritizeParent=TRUE)
```





### Setup
### Example pipeline to annotate a dataset
We will be working with some data from the GDSC and Cell Model Passports datasets
for this vignette. The GDSC dataset contains information about the cell lines
in the Genomics of Drug Sensitivity in Cancer (GDSC) project. The Cell Model
Passports dataset contains information about all the models in the database.






The GDSC sampleMetadata contains two columns, `GDSC.Sample_Name` and `GDSC.COSMIC_ID`.
``` {r setup data}
data(gdsc_sampleMetadata)
data(cell_model_passports_models)
```

```{r view data}
head(gdsc_sampleMetadata)
# head(gdsc_sampleMetadata)
```

Expand All @@ -135,13 +135,13 @@ head(gdsc_sampleMetadata)


```{r view data2}
head(cell_model_passports_models)
# head(cell_model_passports_models)
```


```{r map one name to fields}
fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site")
# fields <- c("id", "ac", "sy", "sx", "ag", "derived-from-site")
# By passing the fields argument, the function will return the information for the fields specified
# AnnotationGx::mapCell2Accession(cell_model_passports_models[["CMP.model_name"]], to=fields) |> head()
Expand Down

0 comments on commit c2da4a4

Please sign in to comment.