Skip to content

Commit

Permalink
Improve getTaxonomyLabels (#672)
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman authored Jan 18, 2025
1 parent a371208 commit a8e1e0a
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mia
Type: Package
Version: 1.15.13
Version: 1.15.14
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
135 changes: 95 additions & 40 deletions R/taxonomy.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,43 +33,50 @@
#'
#' @param with.rank \code{Logical scalar}. Should the level be add as a
#' suffix? For example: "Phylum:Crenarchaeota". (Default: \code{FALSE})
#'
#'
#' @param with_rank Deprecated. Use \code{with.rank} instead.
#'
#' @param make.unique \code{Logical scalar}. Should the labels be made
#' unique, if there are any duplicates? (Default: \code{TRUE})
#'
#'
#' @param make_unique Deprecated. Use \code{make.unique} instead.
#'
#' @param resolve.loops \code{Logical scalar}. Should \code{resolveLoops}
#' be applied to the taxonomic data? Please note that has only an effect,
#' if the data is unique. (Default: \code{TRUE})
#'
#'
#' @param resolve_loops Deprecated. Use \code{resolve.loops} instead.
#'
#' @param taxa \code{Character vector}. Used for subsetting the
#' @param taxa \code{Character vector}. Used for subsetting the
#' taxonomic information. If no information is found,\code{NULL} is returned
#' for the individual element. (Default: \code{NULL})
#'
#' @param from
#' @param from
#' \itemize{
#' \item For \code{mapTaxonomy}: \code{character scalar}. A value which
#' \item For \code{mapTaxonomy}: \code{character scalar}. A value which
#' must be a valid taxonomic rank. (Default: \code{NULL})
#' \item otherwise a \code{Taxa} object as returned by
#' \item otherwise a \code{Taxa} object as returned by
#' \code{\link[DECIPHER:IdTaxa]{IdTaxa}}
#' }
#'
#' @param to \code{Character Scalar}. Must be a valid
#' @param to \code{Character Scalar}. Must be a valid
#' taxonomic rank. (Default: \code{NULL})
#'
#'
#' @param use.grepl \code{Logical}. Should pattern matching via
#' \code{grepl} be used? Otherwise literal matching is used.
#' (Default: \code{FALSE})
#'
#'
#' @param use_grepl Deprecated. Use \code{use.grepl} instead.
#'
#' @param ... optional arguments not used currently.
#'
#' @param ... additional arguments
#' \itemize{
#' \item \code{lowest.rank}: A lowest taxonomy level to be considered in
#' \code{getTaxonomyLabels}. Ranks lower than this will be collapsed into rank
#' specified by \code{lowest.rank}. For example, if genus level is specified,
#' species will be collapsed into genus. If \code{NULL}, the data is not
#' collapsed. (Default: \code{NULL})
#' }
#'
#' @param ranks \code{Character vector}. A vector of ranks to be set.
#' @details
#' Taxonomic information from the \code{IdTaxa} function of \code{DECIPHER}
Expand All @@ -84,9 +91,9 @@
#' \item \code{taxonomyRanks}: a \code{character} vector with all the
#' taxonomic ranks found in \code{colnames(rowData(x))}
#' \item \code{taxonomyRankEmpty}: a \code{logical} value
#' \item \code{mapTaxonomy}: a \code{list} per element of taxa. Each
#' \item \code{mapTaxonomy}: a \code{list} per element of taxa. Each
#' element is either a \code{DataFrame}, a \code{character} or \code{NULL}.
#' If all \code{character} results have the length of one, a single
#' If all \code{character} results have the length of one, a single
#' \code{character} vector is returned.
#' }
#'
Expand All @@ -107,22 +114,27 @@
#' table(taxonomyRankEmpty(GlobalPatterns,"Species"))
#'
#' getTaxonomyLabels(GlobalPatterns[1:20,])
#'
#' # Taxonomy labels represent the lowest taxonomy name that identifies each
#' # taxa. For instance, they can represent OTUs which does no necessarily
#' # tell much. In this case, you might want to get the labels with higher
#' # taxonomy rank
#' getTaxonomyLabels(GlobalPatterns[1:20,], lowest.rank = "Class")
#'
#' # mapTaxonomy
#' ## returns the unique taxonomic information
#' mapTaxonomy(GlobalPatterns)
#' # returns specific unique taxonomic information
#' mapTaxonomy(GlobalPatterns, taxa = "Escherichia")
#' # returns information on a single output
#' mapTaxonomy(GlobalPatterns, taxa = "Escherichia",to="Family")
#'
#'
#' # setTaxonomyRanks
#' tse <- GlobalPatterns
#' colnames(rowData(tse))[1] <- "TAXA1"
#'
#'
#' setTaxonomyRanks(colnames(rowData(tse)))
#' # Taxonomy ranks set to: taxa1 phylum class order family genus species
#'
#' # Taxonomy ranks set to: taxa1 phylum class order family genus species
#'
#' # getTaxonomyRanks is to get/check if the taxonomic ranks is set to "TAXA1"
#' getTaxonomyRanks()
NULL
Expand Down Expand Up @@ -227,7 +239,7 @@ setMethod("checkTaxonomy", signature = c(x = "SummarizedExperiment"),
setTaxonomyRanks <- function(ranks) {
ranks <- tolower(ranks)
# Check if rank is a character vector with length >= 1
if (!is.character(ranks) || length(ranks) < 1
if (!is.character(ranks) || length(ranks) < 1
|| any(ranks == "" | ranks == " " | ranks == "\t" | ranks == "-" |
ranks == "_")
|| any(grepl("\\s{2,}", ranks))) {
Expand Down Expand Up @@ -316,9 +328,15 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
stop("'resolve.loops' must be TRUE or FALSE.", call. = FALSE)
}
#
dup <- duplicated(rowData(x)[,taxonomyRanks(x)])
# Collapse taxonomy ranks if user has specified so
x <- .collapse_lowest_taxonomy_ranks(x, ...)

dup <- duplicated(rowData(x)[,taxonomyRanks(x), drop = FALSE])
if(any(dup)){
td <- apply(rowData(x)[,taxonomyRanks(x)],1L,paste,collapse = "___")
td <- apply(
rowData(x)[,taxonomyRanks(x), drop = FALSE],
1L,
paste, collapse = "___")
td_non_dup <- td[!dup]
m <- match(td, td_non_dup)
}
Expand All @@ -339,6 +357,41 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
}
)

# This function is for collapsing the lowest taxonomy ranks into single value.
# For instance, if user specifies genus rank, genus and species are collapsed
# into one, and species rank is removed from the taxonomy table. If family is
# specified, along with these two, also family is incorporated into this value.
.collapse_lowest_taxonomy_ranks <- function(
x, lowest.rank = NULL, empty.fields = c(NA, "", " ", "\t", "-", "_"),
sep = "_", ...){
if( !.is_a_string(sep) ){
stop("'sep' must be a single character value.", call. = FALSE)
}
# By default, we keep the taxonomy table untouched.
if( !is.null(lowest.rank) ){
.check_taxonomic_rank(lowest.rank, x)
# Get available taxonomy ranks
available_ranks <- taxonomyRanks(x)
# Get indices of ranks that we are going to collapse into one
mod_ranks <- seq(
which(available_ranks == lowest.rank), length(available_ranks))
# For each row, collapse ranks into one
new_rank <- apply(rowData(x)[, mod_ranks, drop = FALSE], 1, function(x){
# Check if empty, and replace with NA if it is
x[ x %in% empty.fields ] <- NA
# Collapse values
x <- paste(na.omit(x), collapse = sep)
return(x)
})
# Remove the collapsed ranks from the original table
rowData(x) <- rowData(x)[, -mod_ranks, drop = FALSE]
# Add the collapsed values to the taxonomy table
new_rank <- unname(new_rank)
rowData(x)[[lowest.rank]] <- new_rank
}
return(x)
}

#' @importFrom IRanges CharacterList LogicalList
.get_tax_ranks_selected <- function(x,rd, tax_cols, empty.fields){
# We need DataFrame here to handle cases with a single entry in tax_cols
Expand Down Expand Up @@ -415,46 +468,48 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
}

#' Calculate hierarchy tree
#'
#'
#' These functions generate a hierarchy tree using taxonomic information from a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{SummarizedExperiment}}
#' object and add this hierarchy tree into the \code{rowTree}.
#'
#'
#' @inheritParams taxonomy-methods
#'
#'
#' @param ... optional arguments not used currently.
#'
#' @details
#'
#' \code{addHierarchyTree} calculates a hierarchy tree from the available
#'
#' \code{addHierarchyTree} calculates a hierarchy tree from the available
#' taxonomic information and add it to \code{rowTree}.
#'
#'
#' \code{getHierarchyTree} generates a hierarchy tree from the available
#' taxonomic information. Internally it uses
#' \code{\link[TreeSummarizedExperiment:toTree]{toTree}} and
#' \code{\link[TreeSummarizedExperiment:resolveLoop]{resolveLoop}} to sanitize
#' data if needed.
#'
#'
#' Please note that a hierarchy tree is not an actual phylogenetic tree.
#' A phylogenetic tree represents evolutionary relationships among features.
#' On the other hand, a hierarchy tree organizes species into a hierarchical
#' structure based on their taxonomic ranks.
#'
#' structure based on their taxonomic ranks.
#'
#' @return
#' \itemize{
#' \item \code{addHierarchyTree}: a \code{TreeSummarizedExperiment} whose
#' \code{phylo} tree represents the hierarchy among available taxonomy
#' \code{phylo} tree represents the hierarchy among available taxonomy
#' information.
#' \item \code{getHierarchyTree}: a \code{phylo} tree representing the
#' \item \code{getHierarchyTree}: a \code{phylo} tree representing the
#' hierarchy among available taxonomy information.
#' }
#'
#'
#' @name hierarchy-tree
#'
#'
#' @examples
#' # Generate a tree based on taxonomic rank hierarchy (a hierarchy tree).
#' data(GlobalPatterns)
#' tse <- GlobalPatterns
#' getHierarchyTree(tse)
#'
#'
#' # Add a hierarchy tree to a TreeSummarizedExperiment.
#' # Please note that any tree already stored in rowTree() will be overwritten.
#' tse <- addHierarchyTree(tse)
Expand All @@ -470,7 +525,7 @@ setMethod("getHierarchyTree", signature = c(x = "SummarizedExperiment"),
# Input check
# If there is no rowData it is not possible to create rowTree
if( ncol(rowData(x)) == 0L ){
stop("'x' does not have rowData. Tree cannot be created.",
stop("'x' does not have rowData. Tree cannot be created.",
call. = FALSE)
}
# If there are no taxonomy ranks
Expand Down Expand Up @@ -601,7 +656,7 @@ setMethod("mapTaxonomy", signature = c(x = "SummarizedExperiment"),
stop("'from' must be an element of taxonomyRanks(x).",
call. = FALSE)
}
}
}
if(!is.null(to)){
if(!.is_a_string(to)){
stop("'to' must be a single character value.",
Expand Down Expand Up @@ -683,13 +738,13 @@ setMethod("mapTaxonomy", signature = c(x = "SummarizedExperiment"),
}

#' @importFrom SummarizedExperiment rowData
.get_tax_groups <- function(x, col, ignore.taxonomy = onRankOnly,
.get_tax_groups <- function(x, col, ignore.taxonomy = onRankOnly,
onRankOnly = FALSE, ...){
# input check
if(!.is_a_bool(ignore.taxonomy)){
stop("'ignore.taxonomy' must be TRUE or FALSE.", call. = FALSE)
}

tax_cols <- .get_tax_cols_from_se(x)
tax_col_n <- seq_along(tax_cols)
if(length(tax_col_n) < col){
Expand Down
16 changes: 14 additions & 2 deletions man/taxonomy-methods.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions tests/testthat/test-2taxonomy.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,18 @@ test_that("taxonomy", {
expect_equal(getTaxonomyLabels(xtse, resolve.loops = TRUE),
c("Family:j","Phylum:a","Family:k","Family:l","Family:m",
"Family:n","Family:o_1","Phylum:c","Family:o_2"))
# Check that lowest.rank works
data(GlobalPatterns)
tse <- GlobalPatterns
labs <- getTaxonomyLabels(tse, lowest.rank = "Kingdom", with.rank = TRUE)
expect_true( all(unlist(lapply(labs, grepl, pattern = "Kingdom:"))) )
labs <- getTaxonomyLabels(tse, lowest.rank = "Class", with.rank = TRUE)
expect_true( all(!unlist(lapply(labs, grepl, pattern = "Order|FamilySpecies|Genus"))) )
#
tse <- agglomerateByRank(tse, rank = "Phylum")
labs <- getTaxonomyLabels(tse)
labs1 <- getTaxonomyLabels(tse, lowest.rank = "Class")
expect_equal(labs, labs1)

# addHierarchyTree
data(GlobalPatterns, package="mia")
Expand Down

0 comments on commit a8e1e0a

Please sign in to comment.