diff --git a/DESCRIPTION b/DESCRIPTION index 4e42624f..fe278753 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Suggests: knitr, RcppHNSW, rmarkdown, + rnndescent, RSpectra, testthat LinkingTo: diff --git a/NEWS.md b/NEWS.md index f2f20a2e..786a580d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,7 +13,20 @@ for details on these parameters. Although typically faster than Annoy (for a given accuracy), be aware that the only supported `metric` values are `"euclidean"`, `"cosine"` and `"correlation"`. Finally, RcppHNSW is only a suggested package, not a requirement, so you need to install it yourself (e.g. -via `install.packages("RcppHNSW")`). +via `install.packages("RcppHNSW")`). Also see the +[article on HNSW in uwot](https://jlmelville.github.io/uwot/articles/hnsw-umap.html) +in the documentation. +* The nearest neighbor descent approximate nearest neighbor search algorithm is +now supported via the +[rnndescent](https://cran.r-project.org/package=rnndescent) package. Set +`nn_method = "nndescent"` to use it. The behavior of the method can be +controlled by the new `nn_args` parameter. There are many supported metrics and +possible parameters that can be set in `nn_args`, so please see the +[article on nearest neighbor descent in uwot](https://jlmelville.github.io/uwot/articles/rnndescent-umap.html) +in the documentation, and also the rnndescent package's +[documentation](https://jlmelville.github.io/rnndescent/index.html) for details. +`rnndescent` is only a suggested package, not a requirement, so you need to +install it yourself (e.g. via `install.packages("rnndescent")`). 
## Bug fixes and minor improvements diff --git a/R/neighbors.R b/R/neighbors.R index 2c03a637..ba84e164 100644 --- a/R/neighbors.R +++ b/R/neighbors.R @@ -50,36 +50,21 @@ find_nn <- function(X, k, include_self = TRUE, method = "fnn", ) }, "hnsw" = { - nn_args_names <- names(nn_args) - - if ("M" %in% nn_args_names) { - M <- nn_args$M - } - else { - M <- 16 - } - - if ("ef_construction" %in% nn_args_names) { - ef_construction <- nn_args$ef_construction - } - else { - ef_construction <- 200 - } - - if ("ef" %in% nn_args_names) { - ef <- nn_args$ef - } - else { - ef <- 10 - } - - res <- hnsw_nn( + nn_args$X <- X + nn_args$k <- k + nn_args$metric <- metric + nn_args$n_threads <- n_threads + nn_args$verbose <- verbose + nn_args$ret_index <- ret_index + + res <- do.call(hnsw_nn, nn_args) + }, + "nndescent" = { + res <- nndescent_nn( X, k = k, metric = metric, - M = M, - ef_construction = ef_construction, - ef = ef, + nn_args = nn_args, n_threads = n_threads, ret_index = ret_index, verbose = verbose diff --git a/R/nn_hnsw.R b/R/nn_hnsw.R index b648d0e5..214c31c3 100644 --- a/R/nn_hnsw.R +++ b/R/nn_hnsw.R @@ -102,3 +102,8 @@ hnsw_load <- function(name, ndim, filename) { ) methods::new(class_name, ndim, filename) } + +is_ok_hnsw_metric <- function(metric) { + hnsw_metrics <- c("euclidean", "cosine", "correlation") + metric %in% hnsw_metrics +} diff --git a/R/nn_nndescent.R b/R/nn_nndescent.R new file mode 100644 index 00000000..ccafb5a1 --- /dev/null +++ b/R/nn_nndescent.R @@ -0,0 +1,157 @@ +nndescent_nn <- function(X, + k = 10, + metric = "euclidean", + nn_args = list(), + n_threads = NULL, + ret_index = FALSE, + verbose = FALSE) { + if (is.null(n_threads)) { + n_threads <- default_num_threads() + } + + if (!ret_index) { + nn_knn_args <- get_nndescent_knn_args(nn_args) + nn_knn_args <- lmerge( + nn_knn_args, + list( + data = X, + k = k, + metric = metric, + n_threads = n_threads, + verbose = verbose + ) + ) + return(do.call(rnndescent::rnnd_knn, nn_knn_args)) + } + + ann <- 
nndescent_build( + X, + k, + metric, + nn_args = nn_args, + n_threads = n_threads, + verbose = verbose + ) + res <- + list( + idx = ann$ann$graph$idx, + dist = ann$ann$graph$dist, + index = ann + ) + res$index$ann$ann$graph <- NULL + res +} + +nndescent_build <- function(X, + k, + metric, + nn_args = list(), + n_threads = NULL, + verbose = FALSE) { + nn_build_args <- get_nndescent_build_args(nn_args) + nn_build_args <- lmerge( + nn_build_args, + list( + data = X, + k = k, + metric = metric, + n_threads = n_threads, + verbose = verbose + ) + ) + + index <- do.call(rnndescent::rnnd_build, nn_build_args) + list( + ann = index, + type = "nndescentv1", + metric = metric, + ndim = ncol(X) + ) +} + + +nndescent_search <- function(X, + k, + ann, + nn_args = list(), + n_threads = NULL, + verbose = FALSE) { + nn_query_args <- get_nndescent_query_args(nn_args) + nn_query_args <- lmerge( + nn_query_args, + list( + index = ann$ann, + query = X, + k = k, + n_threads = n_threads, + verbose = verbose + ) + ) + + do.call(rnndescent::rnnd_query, nn_query_args) +} + +get_nndescent_knn_args <- function(nn_args) { + nn_knn_args <- list() + nnd_knn_names <- c( + "use_alt_metric", + "init", + "n_trees", + "leaf_size", + "max_tree_depth", + "margin", + "n_iters", + "delta", + "max_candidates", + "weight_by_degree", + "low_memory" + ) + for (name in nnd_knn_names) { + if (name %in% names(nn_args)) { + nn_knn_args[[name]] <- nn_args[[name]] + } + } + nn_knn_args +} + +get_nndescent_build_args <- function(nn_args) { + # prune_reverse should probably always be TRUE + nn_build_args <- list(prune_reverse = TRUE) + nnd_build_names <- c( + "use_alt_metric", + "init", + "n_trees", + "leaf_size", + "max_tree_depth", + "margin", + "n_iters", + "delta", + "max_candidates", + "weight_by_degree", + "low_memory", + "n_search_trees", + "pruning_degree_multiplier", + "diversify_prob", + "prune_reverse" + ) + for (name in nnd_build_names) { + if (name %in% names(nn_args)) { + nn_build_args[[name]] <- 
nn_args[[name]] + } + } + nn_build_args +} + +get_nndescent_query_args <- function(nn_args) { + nn_query_args <- list() + nnd_query_names <- c( + "epsilon", + "max_search_fraction" + ) + for (name in nnd_query_names) { + if (name %in% names(nn_args)) { + nn_query_args[[name]] <- nn_args[[name]] + } + } + nn_query_args +} diff --git a/R/transform.R b/R/transform.R index 25f461ac..a7d69def 100644 --- a/R/transform.R +++ b/R/transform.R @@ -244,6 +244,13 @@ umap_transform <- function(X = NULL, model = NULL, ) } + if (is.character(model$nn_method) && + model$nn_method == "nndescent" && !is_installed("rnndescent")) { + stop( + "This model requires the rnndescent package to be installed." + ) + } + if (is.null(n_epochs)) { n_epochs <- model$n_epochs if (is.null(n_epochs)) { @@ -562,6 +569,17 @@ umap_transform <- function(X = NULL, model = NULL, nn$dist <- sqrt(nn$dist) } } + else if (startsWith(ann$type, "nndescent")) { + nn <- + nndescent_search( + X, + k = n_neighbors, + ann = ann, + nn_args = model$nn_args, + n_threads = n_threads, + verbose = verbose + ) + } else { stop("Unknown nn method: ", ann$type) } diff --git a/R/uwot.R b/R/uwot.R index fab17834..51424f53 100644 --- a/R/uwot.R +++ b/R/uwot.R @@ -1,7 +1,7 @@ #' Dimensionality Reduction with UMAP #' #' Carry out dimensionality reduction of a dataset using the Uniform Manifold -#' Approximation and Projection (UMAP) method (McInnes & Healy, 2018). Some of +#' Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of #' the following help text is lifted verbatim from the Python reference #' implementation at \url{https://github.com/lmcinnes/umap}. #' @@ -38,8 +38,19 @@ #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -#' distance metric is always "euclidean"). -#' +#' distance metric is always "euclidean"). 
If +#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +#' and \code{nn_method = "hnsw"} is specified then only the following metrics +#' are available: +#' \itemize{ +#' \item \code{"euclidean"} +#' \item \code{"cosine"} +#' \item \code{"correlation"} +#' } +#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +#' installed and \code{nn_method = "nndescent"} is specified then many more +#' metrics are avaiable. For more details see the package documentation of +#' \code{rnndescent}. #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in #' the list is one of the metric names above. The value of each list item should @@ -178,13 +189,21 @@ #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. -#' \item \code{"hnsw"} Use approximate nearest neighbors via the +#' \item \code{"hnsw"} Use approximate nearest neighbors with the +#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, +#' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is -#' only available if you have installed \code{RcppHNSW} yourself. Only -#' Also, HNSW only supports the following arguments for \code{metric} and +#' only available if you have installed \code{RcppHNSW} yourself. Also, +#' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. +#' \item \code{"nndescent"} Use approximate nearest neighbors with the +#' Nearest Neighbor Descent method (Dong et al., 2011) via the +#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} +#' package. 
\code{rnndescent} is not a dependency of this package: this +#' option is only available if you have installed \code{rnndescent} +#' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -234,6 +253,54 @@ #' and cannot be higher than the number of items in the index. Default is #' \code{10}. #' } +#' For \code{nn_method = "nndescent"}, you may specify the following +#' arguments: +#' \itemize{ +#' \item \code{n_trees} The number of trees to use in a random projection +#' forest to initialize the search. A larger number will give more accurate +#' results at the cost of a longer computation time. The default of +#' \code{NULL} means that the number is chosen based on the number of +#' observations in \code{X}. +#' \item \code{max_candidates} The number of potential neighbors to explore +#' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +#' whichever is smaller. A larger number will give more accurate results at +#' the cost of a longer computation time. +#' \item \code{n_iters} The number of iterations to run the search. A larger +#' number will give more accurate results at the cost of a longer computation +#' time. By default, this will be chosen based on the number of observations +#' in \code{X}. You may also need to modify the convergence criterion +#' \code{delta}. +#' \item \code{delta} The minimum relative change in the neighbor graph +#' allowed before early stopping. Should be a value between 0 and 1. The +#' smaller the value, the smaller the amount of progress between iterations is +#' allowed. Default value of \code{0.001} means that at least 0.1% of the +#' neighbor graph must be updated at each iteration. +#' \item \code{init} How to initialize the nearest neighbor descent. By +#' default this is set to \code{"tree"} and uses a random project forest. 
+#' If you set this to \code{"rand"}, then a random selection is used. Usually +#' this is less accurate than using RP trees, but for high-dimensional cases, +#' there may be little difference in the quality of the initialization and +#' random initialization will be a lot faster. If you set this to +#' \code{"rand"}, then the \code{n_trees} parameter is ignored. +#' \item \code{pruning_degree_multiplier} The maximum number of edges per node +#' to retain in the search graph, relative to \code{n_neighbors}. A larger +#' value will give more accurate results at the cost of a longer computation +#' time. Default is \code{1.5}. This parameter only affects neighbor search +#' when transforming new data with \code{\link{umap_transform}}. +#' \item \code{epsilon} Controls the degree of the back-tracking when +#' traversing the search graph. Setting this to \code{0.0} will do a greedy +#' search with no back-tracking. A larger value will give more accurate +#' results at the cost of a longer computation time. Default is \code{0.1}. +#' This parameter only affects neighbor search when transforming new data with +#' \code{\link{umap_transform}}. +#' \item \code{max_search_fraction} Specifies the maximum fraction of the +#' search graph to traverse. By default, this is set to \code{1.0}, so the +#' entire graph (i.e. all items in \code{X}) may be visited. You may want to +#' set this to a smaller value if you have a very large dataset (in +#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +#' of the data in \code{X}. This parameter only affects neighbor search when +#' transforming new data with \code{\link{umap_transform}}. +#' } #' @param approx_pow If \code{TRUE}, use an approximation to the power function #' in the UMAP gradient, from #' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. @@ -539,12 +606,24 @@ #' \emph{Advances in Neural Information Processing Systems}, \emph{34}. 
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} #' +#' Dong, W., Moses, C., & Li, K. (2011, March). +#' Efficient k-nearest neighbor graph construction for generic similarity measures. +#' In \emph{Proceedings of the 20th international conference on World Wide Web} +#' (pp. 577-586). +#' ACM. +#' \doi{10.1145/1963405.1963487}. +#' #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' -#' McInnes, L., & Healy, J. (2018). +#' Malkov, Y. A., & Yashunin, D. A. (2018). +#' Efficient and robust approximate nearest neighbor search using hierarchical +#' navigable small world graphs. +#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. +#' +#' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} @@ -642,13 +721,14 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP) #' -#' A faster (but less flexible) version of the UMAP gradient. For more detail on -#' UMAP, see the \code{\link{umap}} function. +#' A faster (but less flexible) version of the UMAP (McInnes et al, 2018) +#' gradient. For more detail on UMAP, see the \code{\link{umap}} function. #' #' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you -#' get back the Cauchy distribution as used in t-SNE and LargeVis. It also -#' results in a substantially simplified gradient expression. This can give -#' a speed improvement of around 50\%. +#' get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton, +#' 2008) and LargeVis (Tang et al., 2016). It also results in a substantially +#' simplified gradient expression. 
This can give a speed improvement of around +#' 50\%. #' #' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}}, #' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}. @@ -683,7 +763,19 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -#' distance metric is always "euclidean"). +#' distance metric is always "euclidean"). If +#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +#' and \code{nn_method = "hnsw"} is specified then only the following metrics +#' are available: +#' \itemize{ +#' \item \code{"euclidean"} +#' \item \code{"cosine"} +#' \item \code{"correlation"} +#' } +#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +#' installed and \code{nn_method = "nndescent"} is specified then many more +#' metrics are avaiable. For more details see the package documentation of +#' \code{rnndescent}. #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in @@ -808,13 +900,21 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. -#' \item \code{"hnsw"} Use approximate nearest neighbors via the +#' \item \code{"hnsw"} Use approximate nearest neighbors with the +#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, +#' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is -#' only available if you have installed \code{RcppHNSW} yourself. 
-#' Also, HNSW only supports the following arguments for \code{metric} and +#' only available if you have installed \code{RcppHNSW} yourself. Also, +#' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. +#' \item \code{"nndescent"} Use approximate nearest neighbors with the +#' Nearest Neighbor Descent method (Dong et al., 2011) via the +#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} +#' package. \code{rnndescent} is not a dependency of this package: this +#' option is only available if you have installed \code{rnndescent} +#' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -864,6 +964,102 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' and cannot be higher than the number of items in the index. Default is #' \code{10}. #' } +#' For \code{nn_method = "nndescent"}, you may specify the following +#' arguments: +#' \itemize{ +#' \item \code{n_trees} The number of trees to use in a random projection +#' forest to initialize the search. A larger number will give more accurate +#' results at the cost of a longer computation time. The default of +#' \code{NULL} means that the number is chosen based on the number of +#' observations in \code{X}. +#' \item \code{max_candidates} The number of potential neighbors to explore +#' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +#' whichever is smaller. A larger number will give more accurate results at +#' the cost of a longer computation time. +#' \item \code{n_iters} The number of iterations to run the search. A larger +#' number will give more accurate results at the cost of a longer computation +#' time. By default, this will be chosen based on the number of observations +#' in \code{X}. 
You may also need to modify the convergence criterion +#' \code{delta}. +#' \item \code{delta} The minimum relative change in the neighbor graph +#' allowed before early stopping. Should be a value between 0 and 1. The +#' smaller the value, the smaller the amount of progress between iterations is +#' allowed. Default value of \code{0.001} means that at least 0.1% of the +#' neighbor graph must be updated at each iteration. +#' \item \code{init} How to initialize the nearest neighbor descent. By +#' default this is set to \code{"tree"} and uses a random project forest. +#' If you set this to \code{"rand"}, then a random selection is used. Usually +#' this is less accurate than using RP trees, but for high-dimensional cases, +#' there may be little difference in the quality of the initialization and +#' random initialization will be a lot faster. If you set this to +#' \code{"rand"}, then the \code{n_trees} parameter is ignored. +#' \item \code{pruning_degree_multiplier} The maximum number of edges per node +#' to retain in the search graph, relative to \code{n_neighbors}. A larger +#' value will give more accurate results at the cost of a longer computation +#' time. Default is \code{1.5}. This parameter only affects neighbor search +#' when transforming new data with \code{\link{umap_transform}}. +#' \item \code{epsilon} Controls the degree of the back-tracking when +#' traversing the search graph. Setting this to \code{0.0} will do a greedy +#' search with no back-tracking. A larger value will give more accurate +#' results at the cost of a longer computation time. Default is \code{0.1}. +#' This parameter only affects neighbor search when transforming new data with +#' \code{\link{umap_transform}}. +#' \item \code{max_search_fraction} Specifies the maximum fraction of the +#' search graph to traverse. By default, this is set to \code{1.0}, so the +#' entire graph (i.e. all items in \code{X}) may be visited. 
You may want to +#' set this to a smaller value if you have a very large dataset (in +#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +#' of the data in \code{X}. This parameter only affects neighbor search when +#' transforming new data with \code{\link{umap_transform}}. +#' } +#' For \code{nn_method = "nndescent"}, you may specify the following +#' arguments: +#' \itemize{ +#' \item \code{n_trees} The number of trees to use in a random projection +#' forest to initialize the search. A larger number will give more accurate +#' results at the cost of a longer computation time. The default of +#' \code{NULL} means that the number is chosen based on the number of +#' observations in \code{X}. +#' \item \code{max_candidates} The number of potential neighbors to explore +#' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +#' whichever is smaller. A larger number will give more accurate results at +#' the cost of a longer computation time. +#' \item \code{n_iters} The number of iterations to run the search. A larger +#' number will give more accurate results at the cost of a longer computation +#' time. By default, this will be chosen based on the number of observations +#' in \code{X}. You may also need to modify the convergence criterion +#' \code{delta}. +#' \item \code{delta} The minimum relative change in the neighbor graph +#' allowed before early stopping. Should be a value between 0 and 1. The +#' smaller the value, the smaller the amount of progress between iterations is +#' allowed. Default value of \code{0.001} means that at least 0.1% of the +#' neighbor graph must be updated at each iteration. +#' \item \code{init} How to initialize the nearest neighbor descent. By +#' default this is set to \code{"tree"} and uses a random project forest. If +#' you set this to \code{"rand"}, then a random selection is used. 
Usually +#' this is less accurate than using RP trees, but for high-dimensional cases, +#' there may be little difference in the quality of the initialization and +#' random initialization will be a lot faster. If you set this to +#' \code{"rand"}, then the \code{n_trees} parameter is ignored. +#' \item \code{pruning_degree_multiplier} The maximum number of edges per node +#' to retain in the search graph, relative to \code{n_neighbors}. A larger +#' value will give more accurate results at the cost of a longer computation +#' time. Default is \code{1.5}. This parameter only affects neighbor search +#' when transforming new data with \code{\link{umap_transform}}. +#' \item \code{epsilon} Controls the degree of the back-tracking when +#' traversing the search graph. Setting this to \code{0.0} will do a greedy +#' search with no back-tracking. A larger value will give more accurate +#' results at the cost of a longer computation time. Default is \code{0.1}. +#' This parameter only affects neighbor search when transforming new data with +#' \code{\link{umap_transform}}. +#' \item \code{max_search_fraction} Specifies the maximum fraction of the +#' search graph to traverse. By default, this is set to \code{1.0}, so the +#' entire graph (i.e. all items in \code{X}) may be visited. You may want to +#' set this to a smaller value if you have a very large dataset (in +#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +#' of the data in \code{X}. This parameter only affects neighbor search when +#' transforming new data with \code{\link{umap_transform}}. +#' } #' @param y Optional target data for supervised dimension reduction. Can be a #' vector, matrix or data frame. Use the \code{target_metric} parameter to #' specify the metrics to use, using the same syntax as \code{metric}. 
Usually @@ -1115,6 +1311,68 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. #' @examples #' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5) +#' +#' @references +#' Belkin, M., & Niyogi, P. (2002). +#' Laplacian eigenmaps and spectral techniques for embedding and clustering. +#' In \emph{Advances in neural information processing systems} +#' (pp. 585-591). +#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} +#' +#' Böhm, J. N., Berens, P., & Kobak, D. (2020). +#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. +#' \emph{arXiv preprint} \emph{arXiv:2007.08902}. +#' \url{https://arxiv.org/abs/2007.08902} +#' +#' Damrich, S., & Hamprecht, F. A. (2021). +#' On UMAP's true loss function. +#' \emph{Advances in Neural Information Processing Systems}, \emph{34}. +#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} +#' +#' Dong, W., Moses, C., & Li, K. (2011, March). +#' Efficient k-nearest neighbor graph construction for generic similarity measures. +#' In \emph{Proceedings of the 20th international conference on World Wide Web} +#' (pp. 577-586). +#' ACM. +#' \doi{10.1145/1963405.1963487}. +#' +#' Kingma, D. P., & Ba, J. (2014). +#' Adam: A method for stochastic optimization. +#' \emph{arXiv preprint} \emph{arXiv}:1412.6980. +#' \url{https://arxiv.org/abs/1412.6980} +#' +#' Malkov, Y. A., & Yashunin, D. A. (2018). +#' Efficient and robust approximate nearest neighbor search using hierarchical +#' navigable small world graphs. +#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. +#' +#' McInnes, L., Healy, J., & Melville, J. (2018). 
+#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction +#' \emph{arXiv preprint} \emph{arXiv}:1802.03426. +#' \url{https://arxiv.org/abs/1802.03426} +#' +#' O’Neill, M. E. (2014). +#' \emph{PCG: A family of simple fast space-efficient statistically good +#' algorithms for random number generation} +#' (Report No. HMC-CS-2014-0905). Harvey Mudd College. +#' +#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). +#' Visualizing large-scale and high-dimensional data. +#' In \emph{Proceedings of the 25th International Conference on World Wide Web} +#' (pp. 287-297). +#' International World Wide Web Conferences Steering Committee. +#' \url{https://arxiv.org/abs/1602.00370} +#' +#' Van der Maaten, L., & Hinton, G. (2008). +#' Visualizing data using t-SNE. +#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). +#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} +#' +#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). +#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. +#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. +#' \url{https://www.jmlr.org/papers/v22/20-1061.html} +#' #' @export tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, @@ -1236,7 +1494,19 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -#' distance metric is always "euclidean"). +#' distance metric is always "euclidean"). 
If +#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +#' and \code{nn_method = "hnsw"} is specified then only the following metrics +#' are available: +#' \itemize{ +#' \item \code{"euclidean"} +#' \item \code{"cosine"} +#' \item \code{"correlation"} +#' } +#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +#' installed and \code{nn_method = "nndescent"} is specified then many more +#' metrics are avaiable. For more details see the package documentation of +#' \code{rnndescent}. #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in @@ -1347,13 +1617,20 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. -#' \item \code{"hnsw"} Use approximate nearest neighbors via the +#' \item \code{"hnsw"} Use approximate nearest neighbors with the +#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, +#' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is -#' only available if you have installed \code{RcppHNSW} yourself. -#' Also, HNSW only supports the following arguments for \code{metric} and -#' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and -#' \code{"correlation"}. +#' only available if you have installed \code{RcppHNSW} yourself. Also, +#' HNSW only supports the following arguments for \code{metric}: +#' \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. 
+#' \item \code{"nndescent"} Use approximate nearest neighbors with the +#' Nearest Neighbor Descent method (Dong et al., 2011) via the +#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} +#' package. \code{rnndescent} is not a dependency of this package: this +#' option is only available if you have installed \code{rnndescent} +#' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -1399,6 +1676,36 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' and cannot be higher than the number of items in the index. Default is #' \code{10}. #' } +#' For \code{nn_method = "nndescent"}, you may specify the following +#' arguments: +#' \itemize{ +#' \item \code{n_trees} The number of trees to use in a random projection +#' forest to initialize the search. A larger number will give more accurate +#' results at the cost of a longer computation time. The default of +#' \code{NULL} means that the number is chosen based on the number of +#' observations in \code{X}. +#' \item \code{max_candidates} The number of potential neighbors to explore +#' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +#' whichever is smaller. A larger number will give more accurate results at +#' the cost of a longer computation time. +#' \item \code{n_iters} The number of iterations to run the search. A larger +#' number will give more accurate results at the cost of a longer computation +#' time. By default, this will be chosen based on the number of observations +#' in \code{X}. You may also need to modify the convergence criterion +#' \code{delta}. +#' \item \code{delta} The minimum relative change in the neighbor graph +#' allowed before early stopping. Should be a value between 0 and 1. The +#' smaller the value, the smaller the amount of progress between iterations is +#' allowed. 
Default value of \code{0.001} means that at least 0.1\% of the +#' neighbor graph must be updated at each iteration. +#' \item \code{init} How to initialize the nearest neighbor descent. By +#' default this is set to \code{"tree"} and uses a random projection forest. +#' If you set this to \code{"rand"}, then a random selection is used. Usually +#' this is less accurate than using RP trees, but for high-dimensional cases, +#' there may be little difference in the quality of the initialization and +#' random initialization will be a lot faster. If you set this to +#' \code{"rand"}, then the \code{n_trees} parameter is ignored. +#' } #' @param n_threads Number of threads to use (except during stochastic gradient #' descent). Default is half the number of concurrent threads supported by the #' system. For nearest neighbor search, only applies if @@ -1573,18 +1880,6 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' } #' The returned list contains the combined data from any combination of #' specifying \code{ret_nn} and \code{ret_extra}. -#' @references -#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). -#' Visualizing large-scale and high-dimensional data. -#' In \emph{Proceedings of the 25th International Conference on World Wide Web} -#' (pp. 287-297). -#' International World Wide Web Conferences Steering Committee. -#' \url{https://arxiv.org/abs/1602.00370} -#' -#' Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). -#' Multi-scale similarities in stochastic neighbour embedding: Reducing -#' dimensionality while preserving both local and global structure. -#' \emph{Neurocomputing}, \emph{169}, 246-261. #' #' @examples #' # Default number of epochs is much larger than for UMAP, assumes random @@ -1595,6 +1890,73 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' perplexity = 50, learning_rate = 0.5, #' init = "random", n_epochs = 20 #' ) +#' +#' @references +#' Belkin, M., & Niyogi, P.
(2002). +#' Laplacian eigenmaps and spectral techniques for embedding and clustering. +#' In \emph{Advances in neural information processing systems} +#' (pp. 585-591). +#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} +#' +#' Böhm, J. N., Berens, P., & Kobak, D. (2020). +#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. +#' \emph{arXiv preprint} \emph{arXiv:2007.08902}. +#' \url{https://arxiv.org/abs/2007.08902} +#' +#' Damrich, S., & Hamprecht, F. A. (2021). +#' On UMAP's true loss function. +#' \emph{Advances in Neural Information Processing Systems}, \emph{34}. +#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} +#' +#' Dong, W., Moses, C., & Li, K. (2011, March). +#' Efficient k-nearest neighbor graph construction for generic similarity measures. +#' In \emph{Proceedings of the 20th international conference on World Wide Web} +#' (pp. 577-586). +#' ACM. +#' \doi{10.1145/1963405.1963487}. +#' +#' Kingma, D. P., & Ba, J. (2014). +#' Adam: A method for stochastic optimization. +#' \emph{arXiv preprint} \emph{arXiv}:1412.6980. +#' \url{https://arxiv.org/abs/1412.6980} +#' +#' Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). +#' Multi-scale similarities in stochastic neighbour embedding: Reducing +#' dimensionality while preserving both local and global structure. +#' \emph{Neurocomputing}, \emph{169}, 246-261. +#' +#' Malkov, Y. A., & Yashunin, D. A. (2018). +#' Efficient and robust approximate nearest neighbor search using hierarchical +#' navigable small world graphs. +#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. +#' +#' McInnes, L., Healy, J., & Melville, J. (2018). +#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction +#' \emph{arXiv preprint} \emph{arXiv}:1802.03426. 
+#' \url{https://arxiv.org/abs/1802.03426} +#' +#' O’Neill, M. E. (2014). +#' \emph{PCG: A family of simple fast space-efficient statistically good +#' algorithms for random number generation} +#' (Report No. HMC-CS-2014-0905). Harvey Mudd College. +#' +#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). +#' Visualizing large-scale and high-dimensional data. +#' In \emph{Proceedings of the 25th International Conference on World Wide Web} +#' (pp. 287-297). +#' International World Wide Web Conferences Steering Committee. +#' \url{https://arxiv.org/abs/1602.00370} +#' +#' Van der Maaten, L., & Hinton, G. (2008). +#' Visualizing data using t-SNE. +#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). +#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} +#' +#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). +#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. +#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. +#' \url{https://www.jmlr.org/papers/v22/20-1061.html} +#' #' @export lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean", n_epochs = -1, @@ -1689,7 +2051,19 @@ lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3, #' \item \code{"categorical"} (see below) #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -#' distance metric is always "euclidean"). +#' distance metric is always "euclidean"). 
If +#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +#' and \code{nn_method = "hnsw"} is specified then only the following metrics +#' are available: +#' \itemize{ +#' \item \code{"euclidean"} +#' \item \code{"cosine"} +#' \item \code{"correlation"} +#' } +#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +#' installed and \code{nn_method = "nndescent"} is specified then many more +#' metrics are available. For more details see the package documentation of +#' \code{rnndescent}. #' #' If \code{X} is a data frame or matrix, then multiple metrics can be #' specified, by passing a list to this argument, where the name of each item in @@ -1747,13 +2121,21 @@ lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3, #' \href{https://cran.r-project.org/package=FNN}{FNN} package. #' \item \code{"annoy"} Use approximate nearest neighbors via the #' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. -#' \item \code{"hnsw"} Use approximate nearest neighbors via the +#' \item \code{"hnsw"} Use approximate nearest neighbors with the +#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, +#' 2018) via the #' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. #' \code{RcppHNSW} is not a dependency of this package: this option is -#' only available if you have installed \code{RcppHNSW} yourself. -#' Also, HNSW only supports the following arguments for \code{metric} and +#' only available if you have installed \code{RcppHNSW} yourself. Also, +#' HNSW only supports the following arguments for \code{metric} and #' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and #' \code{"correlation"}. +#' \item \code{"nndescent"} Use approximate nearest neighbors with the +#' Nearest Neighbor Descent method (Dong et al., 2011) via the +#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} +#' package.
\code{rnndescent} is not a dependency of this package: this +#' option is only available if you have installed \code{rnndescent} +#' yourself. #' } #' By default, if \code{X} has less than 4,096 vertices, the exact nearest #' neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -1803,6 +2185,54 @@ lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3, #' and cannot be higher than the number of items in the index. Default is #' \code{10}. #' } +#' For \code{nn_method = "nndescent"}, you may specify the following +#' arguments: +#' \itemize{ +#' \item \code{n_trees} The number of trees to use in a random projection +#' forest to initialize the search. A larger number will give more accurate +#' results at the cost of a longer computation time. The default of +#' \code{NULL} means that the number is chosen based on the number of +#' observations in \code{X}. +#' \item \code{max_candidates} The number of potential neighbors to explore +#' per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +#' whichever is smaller. A larger number will give more accurate results at +#' the cost of a longer computation time. +#' \item \code{n_iters} The number of iterations to run the search. A larger +#' number will give more accurate results at the cost of a longer computation +#' time. By default, this will be chosen based on the number of observations +#' in \code{X}. You may also need to modify the convergence criterion +#' \code{delta}. +#' \item \code{delta} The minimum relative change in the neighbor graph +#' allowed before early stopping. Should be a value between 0 and 1. The +#' smaller the value, the smaller the amount of progress between iterations is +#' allowed. Default value of \code{0.001} means that at least 0.1\% of the +#' neighbor graph must be updated at each iteration. +#' \item \code{init} How to initialize the nearest neighbor descent.
By +#' default this is set to \code{"tree"} and uses a random projection forest. +#' If you set this to \code{"rand"}, then a random selection is used. Usually +#' this is less accurate than using RP trees, but for high-dimensional cases, +#' there may be little difference in the quality of the initialization and +#' random initialization will be a lot faster. If you set this to +#' \code{"rand"}, then the \code{n_trees} parameter is ignored. +#' \item \code{pruning_degree_multiplier} The maximum number of edges per node +#' to retain in the search graph, relative to \code{n_neighbors}. A larger +#' value will give more accurate results at the cost of a longer computation +#' time. Default is \code{1.5}. This parameter only affects neighbor search +#' when transforming new data with \code{\link{umap_transform}}. +#' \item \code{epsilon} Controls the degree of the back-tracking when +#' traversing the search graph. Setting this to \code{0.0} will do a greedy +#' search with no back-tracking. A larger value will give more accurate +#' results at the cost of a longer computation time. Default is \code{0.1}. +#' This parameter only affects neighbor search when transforming new data with +#' \code{\link{umap_transform}}. +#' \item \code{max_search_fraction} Specifies the maximum fraction of the +#' search graph to traverse. By default, this is set to \code{1.0}, so the +#' entire graph (i.e. all items in \code{X}) may be visited. You may want to +#' set this to a smaller value if you have a very large dataset (in +#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +#' of the data in \code{X}. This parameter only affects neighbor search when +#' transforming new data with \code{\link{umap_transform}}. +#' } #' @param perplexity Used only if \code{method = "largevis"}. Controls the size #' of the local neighborhood used for manifold approximation. Should be a #' value between 1 and one less than the number of items in \code{X}.
If @@ -1985,6 +2415,18 @@ lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3, #' all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph) #' #' @references +#' Dong, W., Moses, C., & Li, K. (2011, March). +#' Efficient k-nearest neighbor graph construction for generic similarity measures. +#' In \emph{Proceedings of the 20th international conference on World Wide Web} +#' (pp. 577-586). +#' ACM. +#' \doi{10.1145/1963405.1963487}. +#' +#' Malkov, Y. A., & Yashunin, D. A. (2018). +#' Efficient and robust approximate nearest neighbor search using hierarchical +#' navigable small world graphs. +#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. +#' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. @@ -2306,6 +2748,11 @@ similarity_graph <- function(X = NULL, n_neighbors = NULL, metric = "euclidean", #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' +#' O’Neill, M. E. (2014). +#' \emph{PCG: A family of simple fast space-efficient statistically good +#' algorithms for random number generation} +#' (Report No. HMC-CS-2014-0905). Harvey Mudd College. +#' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. 
#' In \emph{Proceedings of the 25th International Conference on World Wide Web} @@ -2641,14 +3088,13 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", if (!is_installed("RcppHNSW")) { stop("RcppHNSW is required for nn_method = 'hnsw', please install it") } - hnsw_metrics <- c("euclidean", "cosine", "correlation") - if (!metric %in% hnsw_metrics) { + if (!is_ok_hnsw_metric(metric)) { stop( "bad metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" ) } - if (!target_metric %in% hnsw_metrics) { + if (!is_ok_hnsw_metric(target_metric)) { stop( "bad target_metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" @@ -2656,6 +3102,13 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", } } + if (is.character(nn_method) && nn_method == "nndescent") { + if (!is_installed("rnndescent")) { + stop("rnndescent is required for nn_method = 'nndescent',", + "please install it") + } + } + ret_extra <- ret_model || ret_nn || ret_fgraph || ret_sigma || ret_localr # Store categorical columns to be used to generate the graph @@ -3230,7 +3683,7 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", # of them, but for loading the NN index we need the number of # columns explicitly (we don't have access to the column dimension of # the input data at load time) - if (res$nn_index$type %in% c("annoyv2", "hnswv1")) { + if (res$nn_index$type %in% c("annoyv2", "hnswv1", "nndescentv1")) { res$metric[[1]] <- list(ndim = res$nn_index$ndim) } else { @@ -3391,12 +3844,33 @@ save_uwot <- function(model, file, unload = FALSE, verbose = FALSE) { # save each nn index inside tempdir/uwot/model metrics <- names(model$metric) n_metrics <- length(metrics) + for (i in 1:n_metrics) { - nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) if (n_metrics == 1) { - model$nn_index$ann$save(nn_tmpfname) - } else { - model$nn_index[[i]]$ann$save(nn_tmpfname) + nn_index <- model$nn_index 
+ } + else { + nn_index <- model$nn_index[[i]] + } + + if (startsWith(nn_index$type, "annoy") || + startsWith(nn_index$type, "hnsw")) { + + nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) + nn_meta_tmpfname <- file.path(uwot_dir, paste0("nn-meta", i)) + nn_index$ann$save(nn_tmpfname) + + # save metadata wrapper around the index separately + meta_data <- nn_index + meta_data$ann <- NULL + saveRDS(meta_data, file = nn_meta_tmpfname) + } + else if (startsWith(nn_index$type, "nndescent")) { + nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) + saveRDS(nn_index, file = nn_tmpfname) + } + else { + stop("unsupported nn index type: ", model$nn_index$type) } } @@ -3552,13 +4026,18 @@ load_uwot <- function(file, verbose = FALSE) { } else if (nn_method == "hnsw") { ann <- hnsw_load(metric, ndim = ndim, filename = nn_fname) - idx <- - list( - ann = ann, - type = "hnswv1", - metric = metric, - ndim = ndim - ) + nn_meta_tmpfname <- file.path(mod_dir, paste0("uwot/nn-meta", i)) + idx <- readRDS(nn_meta_tmpfname) + idx$ann <- ann + + if (n_metrics == 1) { + model$nn_index <- idx + } else { + model$nn_index[[i]] <- idx + } + } + else if (nn_method == "nndescent") { + idx <- readRDS(nn_fname) if (n_metrics == 1) { model$nn_index <- idx } else { @@ -3682,6 +4161,9 @@ all_nn_indices_are_loaded <- function(model) { else if (model$nn_index$type == "hnswv1") { return(TRUE) } + else if (model$nn_index$type == "nndescentv1") { + return(TRUE) + } else { stop("Invalid model: has unknown 'nn_index' type ", model$nn_index$type) } @@ -3936,7 +4418,8 @@ x2nn <- function(X, n_neighbors, metric, nn_method, nn <- nn_method } else { nn_method <- - match.arg(tolower(nn_method), c("annoy", "fnn", "matrix", "hnsw")) + match.arg(tolower(nn_method), + c("annoy", "fnn", "matrix", "hnsw", "nndescent")) if (nn_method == "fnn" && metric != "euclidean") { stop( "nn_method = 'FNN' is only compatible with distance metric ", diff --git a/man/lvish.Rd b/man/lvish.Rd index 02de54a9..668f9ed4 100644 --- 
a/man/lvish.Rd +++ b/man/lvish.Rd @@ -79,7 +79,19 @@ integer value in the range \code{2} to \code{100}.} \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -distance metric is always "euclidean"). +distance metric is always "euclidean"). If +\href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +and \code{nn_method = "hnsw"} is specified then only the following metrics +are available: +\itemize{ + \item \code{"euclidean"} + \item \code{"cosine"} + \item \code{"correlation"} +} +If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +installed and \code{nn_method = "nndescent"} is specified then many more +metrics are available. For more details see the package documentation of +\code{rnndescent}. If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in @@ -198,13 +210,20 @@ embedding.} \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. - \item \code{"hnsw"} Use approximate nearest neighbors via the + \item \code{"hnsw"} Use approximate nearest neighbors with the + Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, + 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is - only available if you have installed \code{RcppHNSW} yourself. - Also, HNSW only supports the following arguments for \code{metric} and - \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and - \code{"correlation"}. + only available if you have installed \code{RcppHNSW} yourself. Also, + HNSW only supports the following arguments for \code{metric}: + \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}.
+ \item \code{"nndescent"} Use approximate nearest neighbors with the + Nearest Neighbor Descent method (Dong et al., 2011) via the + \href{https://cran.r-project.org/package=rnndescent}{rnndescent} + package. \code{rnndescent} is not a dependency of this package: this + option is only available if you have installed \code{rnndescent} + yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -425,6 +444,36 @@ Default is \code{200}. list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. +} +For \code{nn_method = "nndescent"}, you may specify the following +arguments: +\itemize{ +\item \code{n_trees} The number of trees to use in a random projection +forest to initialize the search. A larger number will give more accurate +results at the cost of a longer computation time. The default of +\code{NULL} means that the number is chosen based on the number of +observations in \code{X}. +\item \code{max_candidates} The number of potential neighbors to explore +per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +whichever is smaller. A larger number will give more accurate results at +the cost of a longer computation time. +\item \code{n_iters} The number of iterations to run the search. A larger +number will give more accurate results at the cost of a longer computation +time. By default, this will be chosen based on the number of observations +in \code{X}. You may also need to modify the convergence criterion +\code{delta}. +\item \code{delta} The minimum relative change in the neighbor graph +allowed before early stopping. Should be a value between 0 and 1. The +smaller the value, the smaller the amount of progress between iterations is +allowed. 
Default value of \code{0.001} means that at least 0.1\% of the +neighbor graph must be updated at each iteration. +\item \code{init} How to initialize the nearest neighbor descent. By +default this is set to \code{"tree"} and uses a random projection forest. +If you set this to \code{"rand"}, then a random selection is used. Usually +this is less accurate than using RP trees, but for high-dimensional cases, +there may be little difference in the quality of the initialization and +random initialization will be a lot faster. If you set this to +\code{"rand"}, then the \code{n_trees} parameter is ignored. }} } \value{ @@ -481,8 +530,57 @@ iris_lvish <- lvish(iris, perplexity = 50, learning_rate = 0.5, init = "random", n_epochs = 20 ) + } \references{ +Belkin, M., & Niyogi, P. (2002). +Laplacian eigenmaps and spectral techniques for embedding and clustering. +In \emph{Advances in neural information processing systems} +(pp. 585-591). +\url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} + +Böhm, J. N., Berens, P., & Kobak, D. (2020). +A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. +\emph{arXiv preprint} \emph{arXiv:2007.08902}. +\url{https://arxiv.org/abs/2007.08902} + +Damrich, S., & Hamprecht, F. A. (2021). +On UMAP's true loss function. +\emph{Advances in Neural Information Processing Systems}, \emph{34}. +\url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} + +Dong, W., Moses, C., & Li, K. (2011, March). +Efficient k-nearest neighbor graph construction for generic similarity measures. +In \emph{Proceedings of the 20th international conference on World Wide Web} +(pp. 577-586). +ACM. +\doi{10.1145/1963405.1963487}. + +Kingma, D. P., & Ba, J. (2014). +Adam: A method for stochastic optimization. +\emph{arXiv preprint} \emph{arXiv}:1412.6980. +\url{https://arxiv.org/abs/1412.6980} + +Lee, J. A., Peluffo-Ordóñez, D.
H., & Verleysen, M. (2015). +Multi-scale similarities in stochastic neighbour embedding: Reducing +dimensionality while preserving both local and global structure. +\emph{Neurocomputing}, \emph{169}, 246-261. + +Malkov, Y. A., & Yashunin, D. A. (2018). +Efficient and robust approximate nearest neighbor search using hierarchical +navigable small world graphs. +\emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. + +McInnes, L., Healy, J., & Melville, J. (2018). +UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction +\emph{arXiv preprint} \emph{arXiv}:1802.03426. +\url{https://arxiv.org/abs/1802.03426} + +O’Neill, M. E. (2014). +\emph{PCG: A family of simple fast space-efficient statistically good +algorithms for random number generation} +(Report No. HMC-CS-2014-0905). Harvey Mudd College. + Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} @@ -490,8 +588,13 @@ In \emph{Proceedings of the 25th International Conference on World Wide Web} International World Wide Web Conferences Steering Committee. \url{https://arxiv.org/abs/1602.00370} -Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). -Multi-scale similarities in stochastic neighbour embedding: Reducing -dimensionality while preserving both local and global structure. -\emph{Neurocomputing}, \emph{169}, 246-261. +Van der Maaten, L., & Hinton, G. (2008). +Visualizing data using t-SNE. +\emph{Journal of Machine Learning Research}, \emph{9} (2579-2605). +\url{https://www.jmlr.org/papers/v9/vandermaaten08a.html} + +Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021). +Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization. +\emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73. 
+\url{https://www.jmlr.org/papers/v22/20-1061.html} } diff --git a/man/optimize_graph_layout.Rd b/man/optimize_graph_layout.Rd index 40eda55d..2278cb3b 100644 --- a/man/optimize_graph_layout.Rd +++ b/man/optimize_graph_layout.Rd @@ -292,6 +292,11 @@ UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} +O’Neill, M. E. (2014). +\emph{PCG: A family of simple fast space-efficient statistically good +algorithms for random number generation} +(Report No. HMC-CS-2014-0905). Harvey Mudd College. + Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). Visualizing large-scale and high-dimensional data. In \emph{Proceedings of the 25th International Conference on World Wide Web} diff --git a/man/similarity_graph.Rd b/man/similarity_graph.Rd index e257c8aa..b315713e 100644 --- a/man/similarity_graph.Rd +++ b/man/similarity_graph.Rd @@ -65,7 +65,19 @@ more local data being preserved. In general values should be in the range \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -distance metric is always "euclidean"). +distance metric is always "euclidean"). If +\href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed +and \code{nn_method = "hnsw"} is specified then only the following metrics +are available: +\itemize{ + \item \code{"euclidean"} + \item \code{"cosine"} + \item \code{"correlation"} +} +If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is +installed and \code{nn_method = "nndescent"} is specified then many more +metrics are available. For more details see the package documentation of +\code{rnndescent}. If \code{X} is a data frame or matrix, then multiple metrics can be specified, by passing a list to this argument, where the name of each item in @@ -127,13 +139,21 @@ dimension of the manifold.
Ignored if \code{method = "largevis"}.} \href{https://cran.r-project.org/package=FNN}{FNN} package. \item \code{"annoy"} Use approximate nearest neighbors via the \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package. - \item \code{"hnsw"} Use approximate nearest neighbors via the + \item \code{"hnsw"} Use approximate nearest neighbors with the + Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, + 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is - only available if you have installed \code{RcppHNSW} yourself. - Also, HNSW only supports the following arguments for \code{metric} and + only available if you have installed \code{RcppHNSW} yourself. Also, + HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. + \item \code{"nndescent"} Use approximate nearest neighbors with the + Nearest Neighbor Descent method (Dong et al., 2011) via the + \href{https://cran.r-project.org/package=rnndescent}{rnndescent} + package. \code{rnndescent} is not a dependency of this package: this + option is only available if you have installed \code{rnndescent} + yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -330,6 +350,54 @@ Default is \code{200}. list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. +} +For \code{nn_method = "nndescent"}, you may specify the following +arguments: +\itemize{ +\item \code{n_trees} The number of trees to use in a random projection +forest to initialize the search. A larger number will give more accurate +results at the cost of a longer computation time. 
The default of +\code{NULL} means that the number is chosen based on the number of +observations in \code{X}. +\item \code{max_candidates} The number of potential neighbors to explore +per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +whichever is smaller. A larger number will give more accurate results at +the cost of a longer computation time. +\item \code{n_iters} The number of iterations to run the search. A larger +number will give more accurate results at the cost of a longer computation +time. By default, this will be chosen based on the number of observations +in \code{X}. You may also need to modify the convergence criterion +\code{delta}. +\item \code{delta} The minimum relative change in the neighbor graph +allowed before early stopping. Should be a value between 0 and 1. The +smaller the value, the smaller the amount of progress between iterations is +allowed. Default value of \code{0.001} means that at least 0.1\% of the +neighbor graph must be updated at each iteration. +\item \code{init} How to initialize the nearest neighbor descent. By +default this is set to \code{"tree"} and uses a random projection forest. +If you set this to \code{"rand"}, then a random selection is used. Usually +this is less accurate than using RP trees, but for high-dimensional cases, +there may be little difference in the quality of the initialization and +random initialization will be a lot faster. If you set this to +\code{"rand"}, then the \code{n_trees} parameter is ignored. +\item \code{pruning_degree_multiplier} The maximum number of edges per node +to retain in the search graph, relative to \code{n_neighbors}. A larger +value will give more accurate results at the cost of a longer computation +time. Default is \code{1.5}. This parameter only affects neighbor search +when transforming new data with \code{\link{umap_transform}}. +\item \code{epsilon} Controls the degree of the back-tracking when +traversing the search graph.
Setting this to \code{0.0} will do a greedy +search with no back-tracking. A larger value will give more accurate +results at the cost of a longer computation time. Default is \code{0.1}. +This parameter only affects neighbor search when transforming new data with +\code{\link{umap_transform}}. +\item \code{max_search_fraction} Specifies the maximum fraction of the +search graph to traverse. By default, this is set to \code{1.0}, so the +entire graph (i.e. all items in \code{X}) may be visited. You may want to +set this to a smaller value if you have a very large dataset (in +conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +of the data in \code{X}. This parameter only affects neighbor search when +transforming new data with \code{\link{umap_transform}}. }} } \value{ @@ -401,6 +469,18 @@ all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph) } \references{ +Dong, W., Moses, C., & Li, K. (2011, March). +Efficient k-nearest neighbor graph construction for generic similarity measures. +In \emph{Proceedings of the 20th international conference on World Wide Web} +(pp. 577-586). +ACM. +\doi{10.1145/1963405.1963487}. + +Malkov, Y. A., & Yashunin, D. A. (2018). +Efficient and robust approximate nearest neighbor search using hierarchical +navigable small world graphs. +\emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. + McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. diff --git a/man/tumap.Rd b/man/tumap.Rd index 42af28bf..4e0be029 100644 --- a/man/tumap.Rd +++ b/man/tumap.Rd @@ -84,7 +84,19 @@ integer value in the range \code{2} to \code{100}.} \item \code{"categorical"} (see below) } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the -distance metric is always "euclidean"). +distance metric is always "euclidean"). 
If
+\href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed
+and \code{nn_method = "hnsw"} is specified then only the following metrics
+are available:
+\itemize{
+  \item \code{"euclidean"}
+  \item \code{"cosine"}
+  \item \code{"correlation"}
+}
+If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
+installed and \code{nn_method = "nndescent"} is specified then many more
+metrics are available. For more details see the package documentation of
+\code{rnndescent}.
 
 If \code{X} is a data frame or matrix, then multiple metrics can be
 specified, by passing a list to this argument, where the name of each item in
@@ -220,13 +232,21 @@ embedding.}
 \href{https://cran.r-project.org/package=FNN}{FNN} package.
 \item \code{"annoy"} Use approximate nearest neighbors via the
 \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
-  \item \code{"hnsw"} Use approximate nearest neighbors via the
+  \item \code{"hnsw"} Use approximate nearest neighbors with the
+  Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
+  2018) via the
   \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
   \code{RcppHNSW} is not a dependency of this package: this option is
-  only available if you have installed \code{RcppHNSW} yourself.
-  Also, HNSW only supports the following arguments for \code{metric} and
+  only available if you have installed \code{RcppHNSW} yourself. Also,
+  HNSW only supports the following arguments for \code{metric} and
   \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
   \code{"correlation"}.
+  \item \code{"nndescent"} Use approximate nearest neighbors with the
+  Nearest Neighbor Descent method (Dong et al., 2011) via the
+  \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
+  package. \code{rnndescent} is not a dependency of this package: this
+  option is only available if you have installed \code{rnndescent}
+  yourself.
}
 By default, if \code{X} has less than 4,096 vertices, the exact nearest
 neighbors are found. Otherwise, approximate nearest neighbors are used.
@@ -522,6 +542,102 @@ Default is \code{200}.
 list used during search. This cannot be smaller than \code{n_neighbors}
 and cannot be higher than the number of items in the index. Default is
 \code{10}.
+}
+For \code{nn_method = "nndescent"}, you may specify the following
+arguments:
+\itemize{
+\item \code{n_trees} The number of trees to use in a random projection
+forest to initialize the search. A larger number will give more accurate
+results at the cost of a longer computation time. The default of
+\code{NULL} means that the number is chosen based on the number of
+observations in \code{X}.
+\item \code{max_candidates} The number of potential neighbors to explore
+per iteration. By default, this is set to \code{n_neighbors} or \code{60},
+whichever is smaller. A larger number will give more accurate results at
+the cost of a longer computation time.
+\item \code{n_iters} The number of iterations to run the search. A larger
+number will give more accurate results at the cost of a longer computation
+time. By default, this will be chosen based on the number of observations
+in \code{X}. You may also need to modify the convergence criterion
+\code{delta}.
+\item \code{delta} The minimum relative change in the neighbor graph
+allowed before early stopping. Should be a value between 0 and 1. The
+smaller the value, the smaller the amount of progress between iterations is
+allowed. Default value of \code{0.001} means that at least 0.1\% of the
+neighbor graph must be updated at each iteration.
+\item \code{init} How to initialize the nearest neighbor descent. By
+default this is set to \code{"tree"} and uses a random projection forest.
+If you set this to \code{"rand"}, then a random selection is used.
Usually +this is less accurate than using RP trees, but for high-dimensional cases, +there may be little difference in the quality of the initialization and +random initialization will be a lot faster. If you set this to +\code{"rand"}, then the \code{n_trees} parameter is ignored. +\item \code{pruning_degree_multiplier} The maximum number of edges per node +to retain in the search graph, relative to \code{n_neighbors}. A larger +value will give more accurate results at the cost of a longer computation +time. Default is \code{1.5}. This parameter only affects neighbor search +when transforming new data with \code{\link{umap_transform}}. +\item \code{epsilon} Controls the degree of the back-tracking when +traversing the search graph. Setting this to \code{0.0} will do a greedy +search with no back-tracking. A larger value will give more accurate +results at the cost of a longer computation time. Default is \code{0.1}. +This parameter only affects neighbor search when transforming new data with +\code{\link{umap_transform}}. +\item \code{max_search_fraction} Specifies the maximum fraction of the +search graph to traverse. By default, this is set to \code{1.0}, so the +entire graph (i.e. all items in \code{X}) may be visited. You may want to +set this to a smaller value if you have a very large dataset (in +conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +of the data in \code{X}. This parameter only affects neighbor search when +transforming new data with \code{\link{umap_transform}}. +} +For \code{nn_method = "nndescent"}, you may specify the following +arguments: +\itemize{ +\item \code{n_trees} The number of trees to use in a random projection +forest to initialize the search. A larger number will give more accurate +results at the cost of a longer computation time. The default of +\code{NULL} means that the number is chosen based on the number of +observations in \code{X}. 
+\item \code{max_candidates} The number of potential neighbors to explore
+per iteration. By default, this is set to \code{n_neighbors} or \code{60},
+whichever is smaller. A larger number will give more accurate results at
+the cost of a longer computation time.
+\item \code{n_iters} The number of iterations to run the search. A larger
+number will give more accurate results at the cost of a longer computation
+time. By default, this will be chosen based on the number of observations
+in \code{X}. You may also need to modify the convergence criterion
+\code{delta}.
+\item \code{delta} The minimum relative change in the neighbor graph
+allowed before early stopping. Should be a value between 0 and 1. The
+smaller the value, the smaller the amount of progress between iterations is
+allowed. Default value of \code{0.001} means that at least 0.1\% of the
+neighbor graph must be updated at each iteration.
+\item \code{init} How to initialize the nearest neighbor descent. By
+default this is set to \code{"tree"} and uses a random projection forest. If
+you set this to \code{"rand"}, then a random selection is used. Usually
+this is less accurate than using RP trees, but for high-dimensional cases,
+there may be little difference in the quality of the initialization and
+random initialization will be a lot faster. If you set this to
+\code{"rand"}, then the \code{n_trees} parameter is ignored.
+\item \code{pruning_degree_multiplier} The maximum number of edges per node
+to retain in the search graph, relative to \code{n_neighbors}. A larger
+value will give more accurate results at the cost of a longer computation
+time. Default is \code{1.5}. This parameter only affects neighbor search
+when transforming new data with \code{\link{umap_transform}}.
+\item \code{epsilon} Controls the degree of the back-tracking when
+traversing the search graph. Setting this to \code{0.0} will do a greedy
+search with no back-tracking.
A larger value will give more accurate +results at the cost of a longer computation time. Default is \code{0.1}. +This parameter only affects neighbor search when transforming new data with +\code{\link{umap_transform}}. +\item \code{max_search_fraction} Specifies the maximum fraction of the +search graph to traverse. By default, this is set to \code{1.0}, so the +entire graph (i.e. all items in \code{X}) may be visited. You may want to +set this to a smaller value if you have a very large dataset (in +conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +of the data in \code{X}. This parameter only affects neighbor search when +transforming new data with \code{\link{umap_transform}}. }} } \value{ @@ -554,15 +670,78 @@ A matrix of optimized coordinates, or: specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}. } \description{ -A faster (but less flexible) version of the UMAP gradient. For more detail on -UMAP, see the \code{\link{umap}} function. +A faster (but less flexible) version of the UMAP (McInnes et al, 2018) +gradient. For more detail on UMAP, see the \code{\link{umap}} function. } \details{ By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you -get back the Cauchy distribution as used in t-SNE and LargeVis. It also -results in a substantially simplified gradient expression. This can give -a speed improvement of around 50\%. +get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton, +2008) and LargeVis (Tang et al., 2016). It also results in a substantially +simplified gradient expression. This can give a speed improvement of around +50\%. } \examples{ iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5) + +} +\references{ +Belkin, M., & Niyogi, P. (2002). +Laplacian eigenmaps and spectral techniques for embedding and clustering. +In \emph{Advances in neural information processing systems} +(pp. 585-591). 
+\url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf} + +Böhm, J. N., Berens, P., & Kobak, D. (2020). +A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum. +\emph{arXiv preprint} \emph{arXiv:2007.08902}. +\url{https://arxiv.org/abs/2007.08902} + +Damrich, S., & Hamprecht, F. A. (2021). +On UMAP's true loss function. +\emph{Advances in Neural Information Processing Systems}, \emph{34}. +\url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} + +Dong, W., Moses, C., & Li, K. (2011, March). +Efficient k-nearest neighbor graph construction for generic similarity measures. +In \emph{Proceedings of the 20th international conference on World Wide Web} +(pp. 577-586). +ACM. +\doi{10.1145/1963405.1963487}. + +Kingma, D. P., & Ba, J. (2014). +Adam: A method for stochastic optimization. +\emph{arXiv preprint} \emph{arXiv}:1412.6980. +\url{https://arxiv.org/abs/1412.6980} + +Malkov, Y. A., & Yashunin, D. A. (2018). +Efficient and robust approximate nearest neighbor search using hierarchical +navigable small world graphs. +\emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. + +McInnes, L., Healy, J., & Melville, J. (2018). +UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction +\emph{arXiv preprint} \emph{arXiv}:1802.03426. +\url{https://arxiv.org/abs/1802.03426} + +O’Neill, M. E. (2014). +\emph{PCG: A family of simple fast space-efficient statistically good +algorithms for random number generation} +(Report No. HMC-CS-2014-0905). Harvey Mudd College. + +Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). +Visualizing large-scale and high-dimensional data. +In \emph{Proceedings of the 25th International Conference on World Wide Web} +(pp. 287-297). +International World Wide Web Conferences Steering Committee. 
+\url{https://arxiv.org/abs/1602.00370}
+
+Van der Maaten, L., & Hinton, G. (2008).
+Visualizing data using t-SNE.
+\emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
+\url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
+
+Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
+Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
+\emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
+\url{https://www.jmlr.org/papers/v22/20-1061.html}
}
diff --git a/man/umap.Rd b/man/umap.Rd
index d8f6e124..016d9288 100644
--- a/man/umap.Rd
+++ b/man/umap.Rd
@@ -90,8 +90,19 @@ integer value in the range \code{2} to \code{100}.}
 \item \code{"categorical"} (see below)
 }
 Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the
-distance metric is always "euclidean").
-
+distance metric is always "euclidean"). If
+\href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is installed
+and \code{nn_method = "hnsw"} is specified then only the following metrics
+are available:
+\itemize{
+  \item \code{"euclidean"}
+  \item \code{"cosine"}
+  \item \code{"correlation"}
+}
+If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
+installed and \code{nn_method = "nndescent"} is specified then many more
+metrics are available. For more details see the package documentation of
+\code{rnndescent}.
 If \code{X} is a data frame or matrix, then multiple metrics can be
 specified, by passing a list to this argument, where the name of each item in
 the list is one of the metric names above. The value of each list item should
@@ -245,13 +256,21 @@ these values are set automatically as determined by \code{min_dist} and
 \href{https://cran.r-project.org/package=FNN}{FNN} package.
 \item \code{"annoy"} Use approximate nearest neighbors via the
 \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
- \item \code{"hnsw"} Use approximate nearest neighbors via the + \item \code{"hnsw"} Use approximate nearest neighbors with the + Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin, + 2018) via the \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package. \code{RcppHNSW} is not a dependency of this package: this option is - only available if you have installed \code{RcppHNSW} yourself. Only - Also, HNSW only supports the following arguments for \code{metric} and + only available if you have installed \code{RcppHNSW} yourself. Also, + HNSW only supports the following arguments for \code{metric} and \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}. + \item \code{"nndescent"} Use approximate nearest neighbors with the + Nearest Neighbor Descent method (Dong et al., 2011) via the + \href{https://cran.r-project.org/package=rnndescent}{rnndescent} + package. \code{rnndescent} is not a dependency of this package: this + option is only available if you have installed \code{rnndescent} + yourself. } By default, if \code{X} has less than 4,096 vertices, the exact nearest neighbors are found. Otherwise, approximate nearest neighbors are used. @@ -560,6 +579,54 @@ Default is \code{200}. list used during search. This cannot be smaller than \code{n_neighbors} and cannot be higher than the number of items in the index. Default is \code{10}. +} +For \code{nn_method = "nndescent"}, you may specify the following +arguments: +\itemize{ +\item \code{n_trees} The number of trees to use in a random projection +forest to initialize the search. A larger number will give more accurate +results at the cost of a longer computation time. The default of +\code{NULL} means that the number is chosen based on the number of +observations in \code{X}. +\item \code{max_candidates} The number of potential neighbors to explore +per iteration. By default, this is set to \code{n_neighbors} or \code{60}, +whichever is smaller. 
A larger number will give more accurate results at
+the cost of a longer computation time.
+\item \code{n_iters} The number of iterations to run the search. A larger
+number will give more accurate results at the cost of a longer computation
+time. By default, this will be chosen based on the number of observations
+in \code{X}. You may also need to modify the convergence criterion
+\code{delta}.
+\item \code{delta} The minimum relative change in the neighbor graph
+allowed before early stopping. Should be a value between 0 and 1. The
+smaller the value, the smaller the amount of progress between iterations is
+allowed. Default value of \code{0.001} means that at least 0.1\% of the
+neighbor graph must be updated at each iteration.
+\item \code{init} How to initialize the nearest neighbor descent. By
+default this is set to \code{"tree"} and uses a random projection forest.
+If you set this to \code{"rand"}, then a random selection is used. Usually
+this is less accurate than using RP trees, but for high-dimensional cases,
+there may be little difference in the quality of the initialization and
+random initialization will be a lot faster. If you set this to
+\code{"rand"}, then the \code{n_trees} parameter is ignored.
+\item \code{pruning_degree_multiplier} The maximum number of edges per node
+to retain in the search graph, relative to \code{n_neighbors}. A larger
+value will give more accurate results at the cost of a longer computation
+time. Default is \code{1.5}. This parameter only affects neighbor search
+when transforming new data with \code{\link{umap_transform}}.
+\item \code{epsilon} Controls the degree of the back-tracking when
+traversing the search graph. Setting this to \code{0.0} will do a greedy
+search with no back-tracking. A larger value will give more accurate
+results at the cost of a longer computation time. Default is \code{0.1}.
+This parameter only affects neighbor search when transforming new data with
+\code{\link{umap_transform}}.
+\item \code{max_search_fraction} Specifies the maximum fraction of the +search graph to traverse. By default, this is set to \code{1.0}, so the +entire graph (i.e. all items in \code{X}) may be visited. You may want to +set this to a smaller value if you have a very large dataset (in +conjunction with \code{epsilon}) to avoid an inefficient exhaustive search +of the data in \code{X}. This parameter only affects neighbor search when +transforming new data with \code{\link{umap_transform}}. }} } \value{ @@ -593,7 +660,7 @@ A matrix of optimized coordinates, or: } \description{ Carry out dimensionality reduction of a dataset using the Uniform Manifold -Approximation and Projection (UMAP) method (McInnes & Healy, 2018). Some of +Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of the following help text is lifted verbatim from the Python reference implementation at \url{https://github.com/lmcinnes/umap}. } @@ -643,12 +710,24 @@ On UMAP's true loss function. \emph{Advances in Neural Information Processing Systems}, \emph{34}. \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html} +Dong, W., Moses, C., & Li, K. (2011, March). +Efficient k-nearest neighbor graph construction for generic similarity measures. +In \emph{Proceedings of the 20th international conference on World Wide Web} +(pp. 577-586). +ACM. +\doi{10.1145/1963405.1963487}. + Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \emph{arXiv preprint} \emph{arXiv}:1412.6980. \url{https://arxiv.org/abs/1412.6980} -McInnes, L., & Healy, J. (2018). +Malkov, Y. A., & Yashunin, D. A. (2018). +Efficient and robust approximate nearest neighbor search using hierarchical +navigable small world graphs. +\emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836. + +McInnes, L., Healy, J., & Melville, J. (2018). 
UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction \emph{arXiv preprint} \emph{arXiv}:1802.03426. \url{https://arxiv.org/abs/1802.03426} diff --git a/tests/testthat/test_neighbors.R b/tests/testthat/test_neighbors.R index b1c32e33..3fb8b924 100644 --- a/tests/testthat/test_neighbors.R +++ b/tests/testthat/test_neighbors.R @@ -475,3 +475,304 @@ test_that("hnsw gives correct correlation neighbor results and multiple threads" tol = 1e-6 ) }) + +# rnndescent + +test_that("nndescent gives correct euclidean neighbor results", { + testthat::skip_if_not_installed("rnndescent") + iris10_annoy <- + umap( + iris10, + n_neighbors = 4, + nn_method = "annoy", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0 + ) + iris10_nnd_no_model <- + umap( + iris10, + n_neighbors = 4, + nn_method = "nndescent", + ret_extra = c("nn"), + ret_model = FALSE, + n_epochs = 0 + ) + expect_equal(iris10_annoy$nn$euclidean$idx, + iris10_nnd_no_model$nn$euclidean$idx, + check.attributes = FALSE) + expect_equal(iris10_annoy$nn$euclidean$dist, + iris10_nnd_no_model$nn$euclidean$dist, + check.attributes = FALSE, + tol = 1e-7) + + iris10_nnd <- + umap( + iris10, + n_neighbors = 4, + nn_method = "nndescent", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0 + ) + expect_equal(iris10_annoy$nn$euclidean$idx, + iris10_nnd$nn$euclidean$idx, + check.attributes = FALSE) + expect_equal(iris10_annoy$nn$euclidean$dist, + iris10_nnd$nn$euclidean$dist, + check.attributes = FALSE, + tol = 1e-7) + + iris10_transform_nnd <- + umap_transform(iris10, + iris10_nnd, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_nnd$nn$euclidean$idx, + iris10_transform_nnd$nn$euclidean$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_nnd$nn$euclidean$dist, + iris10_transform_nnd$nn$euclidean$dist, + check.attributes = FALSE + ) + + iris10_transform_annoy <- + umap_transform(iris10, + iris10_annoy, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + 
iris10_transform_annoy$nn$euclidean$idx, + iris10_transform_nnd$nn$euclidean$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_transform_annoy$nn$euclidean$dist, + iris10_transform_nnd$nn$euclidean$dist, + check.attributes = FALSE, + tol = 1e-6 + ) +}) + +test_that("nndescent gives correct cosine neighbor results", { + testthat::skip_if_not_installed("rnndescent") + iris10_annoy <- + umap( + iris10, + n_neighbors = 4, + nn_method = "annoy", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "cosine" + ) + iris10_nnd <- + umap( + iris10, + n_neighbors = 4, + nn_method = "nndescent", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "cosine" + ) + expect_equal(iris10_annoy$nn$cosine$idx, + iris10_nnd$nn$cosine$idx, + check.attributes = FALSE) + expect_equal(iris10_annoy$nn$cosine$dist, + iris10_nnd$nn$cosine$dist, + check.attributes = FALSE, + tol = 1e-6) + + iris10_transform_nnd <- + umap_transform(iris10, + iris10_nnd, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_nnd$nn$cosine$idx, + iris10_transform_nnd$nn$cosine$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_nnd$nn$cosine$dist, + iris10_transform_nnd$nn$cosine$dist, + check.attributes = FALSE + ) + + iris10_transform_annoy <- + umap_transform(iris10, + iris10_annoy, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_transform_annoy$nn$cosine$idx, + iris10_transform_nnd$nn$cosine$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_transform_annoy$nn$cosine$dist, + iris10_transform_nnd$nn$cosine$dist, + check.attributes = FALSE, + tol = 1e-6 + ) +}) + +test_that("nndescent gives correct correlation neighbor results", { + testthat::skip_if_not_installed("rnndescent") + iris10_annoy <- + umap( + iris10, + n_neighbors = 4, + nn_method = "annoy", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "correlation" + ) + iris10_nnd <- + umap( + iris10, + n_neighbors = 4, + nn_method = "nndescent", + 
ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "correlation" + ) + expect_equal(iris10_annoy$nn$correlation$idx, + iris10_nnd$nn$correlation$idx, + check.attributes = FALSE) + expect_equal(iris10_annoy$nn$correlation$dist, + iris10_nnd$nn$correlation$dist, + check.attributes = FALSE, + tol = 1e-6) + + iris10_transform_nnd <- + umap_transform(iris10, + iris10_nnd, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_nnd$nn$correlation$idx, + iris10_transform_nnd$nn$correlation$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_nnd$nn$correlation$dist, + iris10_transform_nnd$nn$correlation$dist, + check.attributes = FALSE + ) + + iris10_transform_annoy <- + umap_transform(iris10, + iris10_annoy, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_transform_annoy$nn$correlation$idx, + iris10_transform_nnd$nn$correlation$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_transform_annoy$nn$correlation$dist, + iris10_transform_nnd$nn$correlation$dist, + check.attributes = FALSE, + tol = 1e-6 + ) +}) + +test_that("nndescent gives correct correlation neighbor results and multiple threads", { + testthat::skip_if_not_installed("rnndescent") + iris10_annoy <- + umap( + iris10, + n_neighbors = 4, + nn_method = "annoy", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "correlation" + ) + iris10_nnd <- + umap( + iris10, + n_neighbors = 4, + nn_method = "nndescent", + ret_extra = c("nn"), + ret_model = TRUE, + n_epochs = 0, + metric = "correlation", + n_threads = 2 + ) + expect_equal(iris10_annoy$nn$correlation$idx, + iris10_nnd$nn$correlation$idx, + check.attributes = FALSE) + expect_equal(iris10_annoy$nn$correlation$dist, + iris10_nnd$nn$correlation$dist, + check.attributes = FALSE, + tol = 1e-6) + + iris10_transform_nnd <- + umap_transform(iris10, + iris10_nnd, + n_epochs = 0, + ret_extra = c("nn"), + n_threads = 2) + expect_equal( + iris10_nnd$nn$correlation$idx, + 
iris10_transform_nnd$nn$correlation$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_nnd$nn$correlation$dist, + iris10_transform_nnd$nn$correlation$dist, + check.attributes = FALSE + ) + + iris10_transform_annoy <- + umap_transform(iris10, + iris10_annoy, + n_epochs = 0, + ret_extra = c("nn")) + expect_equal( + iris10_transform_annoy$nn$correlation$idx, + iris10_transform_nnd$nn$correlation$idx, + check.attributes = FALSE + ) + expect_equal( + iris10_transform_annoy$nn$correlation$dist, + iris10_transform_nnd$nn$correlation$dist, + check.attributes = FALSE, + tol = 1e-6 + ) + + model_with_args <- umap( + iris10, + n_neighbors = 4, + n_epochs = 2, + init = "spca", + metric = "euclidean", + verbose = FALSE, + n_threads = 0, + ret_model = TRUE, + ret_extra = c("nn"), + nn_method = "nndescent", + nn_args = list( + init = "rand", + prune_reverse = TRUE, + epsilon = 0.0 + ) + ) + expect_equal( + model_with_args$nn_args, + list( + init = "rand", + prune_reverse = TRUE, + epsilon = 0 + ) + ) +}) diff --git a/tests/testthat/test_saveload.R b/tests/testthat/test_saveload.R index 7445222e..c7cff780 100644 --- a/tests/testthat/test_saveload.R +++ b/tests/testthat/test_saveload.R @@ -249,3 +249,92 @@ test_that("save-load hnsw", { transformed_after_reload$embedding ) }) + +test_that("save-load nndescent", { + testthat::skip_if_not_installed("rnndescent") + set.seed(1337) + model <- umap(iris10, + n_neighbors = 4, n_epochs = 2, init = "spca", + metric = "euclidean", verbose = FALSE, n_threads = 0, + ret_model = TRUE, ret_extra = c("nn"), nn_method = "nndescent" + ) + expect_equal(model$nn_method, "nndescent") + + set.seed(1337) + transformed_before_reload <- + umap_transform(iris10, + model, + n_epochs = 2, + ret_extra = c("nn") + ) + + mod_fname <- tempfile(tmpdir = tempdir()) + model <- save_uwot(model, file = mod_fname, unload = TRUE) + + modelload <- load_uwot(file = mod_fname) + + expect_equal(modelload$nn_method, "nndescent") + + set.seed(1337) + 
transformed_after_reload <- + umap_transform(iris10, + modelload, + n_epochs = 2, + ret_extra = c("nn") + ) + + if (file.exists(mod_fname)) { + unlink(mod_fname) + } + expect_true(file.exists(modelload$mod_dir)) + unload_uwot(modelload) + expect_false(file.exists(modelload$mod_dir)) + + expect_equal(model$nn$euclidean$idx, modelload$nn$euclidean$idx) + expect_equal(model$nn$euclidean$dist, modelload$nn$euclidean$dist) + + expect_equal( + transformed_before_reload$nn$euclidean$idx, + transformed_after_reload$nn$euclidean$idx, + ) + expect_equal( + transformed_before_reload$nn$euclidean$dist, + transformed_after_reload$nn$euclidean$dist, + check.attributes = FALSE, + tol = 1e-7 + ) + + expect_equal( + transformed_before_reload$embedding, + transformed_after_reload$embedding + ) + + mod_fname2 <- tempfile(tmpdir = tempdir()) + saveRDS(modelload, mod_fname2) + modelload2 <- readRDS(mod_fname2) + expect_equal(modelload2$nn_method, "nndescent") + set.seed(1337) + transformed_after_reload2 <- + umap_transform(iris10, + modelload2, + n_epochs = 2, + ret_extra = c("nn") + ) + expect_equal( + transformed_after_reload$nn$euclidean$idx, + transformed_after_reload2$nn$euclidean$idx, + ) + expect_equal( + transformed_after_reload$nn$euclidean$dist, + transformed_after_reload2$nn$euclidean$dist, + check.attributes = FALSE, + tol = 1e-7 + ) + expect_equal( + transformed_after_reload$embedding, + transformed_after_reload2$embedding + ) + if (file.exists(mod_fname2)) { + unlink(mod_fname2) + } +})