diff --git a/DESCRIPTION b/DESCRIPTION index 9dde0524..0b57a7d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Encoding: UTF-8 Type: Package Package: fastTopics -Version: 0.5-22 +Version: 0.5-23 Date: 2021-03-04 Title: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data diff --git a/NAMESPACE b/NAMESPACE index 263ce61a..e9a2c33b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ S3method(print,summary.poisson_nmf_fit) S3method(select,multinom_topic_model_fit) S3method(summary,multinom_topic_model_fit) S3method(summary,poisson_nmf_fit) -export(compare_poisson_nmf_fits) +export(compare_fits) export(cost) export(deviance_poisson_nmf) export(diff_count_analysis) @@ -30,7 +30,7 @@ export(pca_hexbin_plot_ggplot_call) export(pca_plot) export(pca_plot_ggplot_call) export(plot_loglik_vs_rank) -export(plot_progress_poisson_nmf) +export(plot_progress) export(poisson2multinom) export(select_loadings) export(simulate_count_data) diff --git a/R/fit_poisson_nmf.R b/R/fit_poisson_nmf.R index 3798572b..ded3cc4d 100644 --- a/R/fit_poisson_nmf.R +++ b/R/fit_poisson_nmf.R @@ -61,8 +61,8 @@ #' (KKT) first-order conditions. As the iterates approach a stationary #' point of the loss function, the change in the model parameters #' should be small, and the residuals of the KKT system should vanish. -#' Use \code{\link{plot_progress_poisson_nmf}} to plot the improvement -#' in the solution over time. +#' Use \code{\link{plot_progress}} to plot the improvement in the +#' solution over time. #' #' See \code{\link{fit_topic_model}} for additional guidance on model #' fitting, particularly for large or complex data sets. @@ -238,21 +238,21 @@ #' \item{progress}{A data frame containing detailed information about #' the algorithm's progress. The data frame should have \code{numiter} #' rows. The columns of the data frame are: "iter", the iteration -#' number; "loglik", the (Poisson) log-likelihood at the current best -#' factor and loading estimates; "dev", the deviance at the current -#' best factor and loading estimates; "res", the maximum residual of -#' the Karush-Kuhn-Tucker (KKT) first-order optimality conditions at -#' the current best factor and loading estimates; "loglik.multinom", -#' the multinomial log-likelihood at the current best factor and -#' loading estimates; "delta.f", the largest change in the factors -#' matrix; "delta.l", the largest change in the loadings matrix; -#' "nonzeros.f", the proportion of entries in the factors matrix that -#' are nonzero; "nonzeros.l", the proportion of entries in the -#' loadings matrix that are nonzero; "extrapolate", which is 1 if -#' extrapolation is used, otherwise it is 0; "beta", the setting of -#' the extrapolation parameter; "betamax", the setting of the -#' extrapolation parameter upper bound; and "timing", the elapsed time -#' in seconds (recorded using \code{\link{proc.time}}).} +#' number; "loglik", the Poisson NMF log-likelihood at the current +#' best factor and loading estimates; "dev", the deviance at the +#' current best factor and loading estimates; "res", the maximum +#' residual of the Karush-Kuhn-Tucker (KKT) first-order optimality +#' conditions at the current best factor and loading estimates; +#' "loglik.multinom", the multinomial topic model log-likelihood at +#' the current best factor and loading estimates; "delta.f", the +#' largest change in the factors matrix; "delta.l", the largest change +#' in the loadings matrix; "nonzeros.f", the proportion of entries in +#' the factors matrix that are nonzero; "nonzeros.l", the proportion +#' of entries in the loadings matrix that are nonzero; "extrapolate", +#' which is 1 if extrapolation is used, otherwise it is 0; "beta", the +#' setting of the extrapolation parameter; "betamax", the setting of +#' the extrapolation parameter upper bound; and "timing", the elapsed +#' time in seconds (recorded using \code{\link{proc.time}}).} #' #' @references #' @@ -309,9 +309,9 @@ #' #' # Compare the two fits. #' fits <- list(em = fit_em,scd = fit_scd) -#' compare_poisson_nmf_fits(fits) -#' plot_progress_poisson_nmf(fits,y = "loglik") -#' plot_progress_poisson_nmf(fits,y = "res") +#' compare_fits(fits) +#' plot_progress(fits,y = "loglik") +#' plot_progress(fits,y = "res") #' #' # Recover the topic model. After this step, the L matrix contains the #' # mixture proportions ("loadings"), and the F matrix contains the @@ -422,7 +422,7 @@ fit_poisson_nmf <- function (X, k, fit0, numiter = 100, method.text <- "CCD" cat(sprintf("Running %d %s updates, %s extrapolation ",numiter, method.text,ifelse(control$extrapolate,"with","without"))) - cat("(fastTopics 0.5-22).\n") + cat("(fastTopics 0.5-23).\n") } # INITIALIZE ESTIMATES diff --git a/R/likelihood.R b/R/likelihood.R index 948d86f3..451c23dc 100644 --- a/R/likelihood.R +++ b/R/likelihood.R @@ -5,9 +5,10 @@ #' @description Compute log-likelihoods and deviances for assessing #' fit of a topic model or a non-negative matrix factorization (NMF). #' -#' @details Function \code{cost} is mainly for internal use to quickly -#' compute log-likelihoods and deviances; it should not be used -#' directly unless you know what you are doing. In particular, very +#' @details Function \code{cost} computes loss functions proportional +#' to the negative log-likelihoods, and is mainly for internal use to +#' quickly compute log-likelihoods and deviances; it should not be +#' used directly unless you know what you are doing. In particular, #' little argument checking is performed by \code{cost}. #' #' @param X The n x m matrix of counts or pseudocounts. It can be a @@ -16,7 +17,7 @@ #' #' @param fit A Poisson NMF or multinomial topic model fit, such as an #' output from \code{\link{fit_poisson_nmf}} or -#' \code{\link{fit_topic_model}}. +#' \code{\link{fit_topic_model}}. #' #' @param A The n x k matrix of loadings. It should be a dense matrix. #' @@ -27,14 +28,14 @@ #' numerical problems at the cost of introducing a very small #' inaccuracy in the computation. #' -#' @param family If \code{model = "poisson"}, the loss function -#' corresponding to the Poisson non-negative matrix factorization is -#' computed; if \code{model = "multinom"}, multinomial topic model -#' loss function values are returned. See "Value" for details. +#' @param family If \code{model = "poisson"}, the loss function values +#' corresponding to the Poisson non-negative matrix factorization are +#' computed; if \code{model = "multinom"}, the multinomial topic model +#' loss function values are returned. #' #' @param version When \code{version == "R"}, the computations are #' performed entirely in R; when \code{version == "Rcpp"}, an Rcpp -#' implementation is used. The R version is typically faster when +#' implementation is used. The R version is typically faster when #' \code{X} is a dense matrix, whereas the Rcpp version is faster and #' more memory-efficient when \code{X} is a large, sparse matrix. When #' not specified, the most suitable version is called depending on @@ -194,10 +195,10 @@ poisson_nmf_kkt <- function (X, F, L, e = 1e-8) { } # Given a Poisson non-negative matrix factorization (F, L), compute -# the log-likelihoods for the "size factors"; that is, the -# log-likelihood for t ~ Poisson(s), where t = sum(x) is the total sum -# of the counts. The size factors, s, are recovered from the -# poisson2multinom transformation. +# the log-likelihoods for the "size factors"; that is, each vetor +# element is a log-likelihood for the model t ~ Poisson(s), where +# t = sum(x) is the total sum of the counts. The size factors, s, are +# recovered from the poisson2multinom transformation. # #' @importFrom stats dpois loglik_size_factors <- function (X, F, L) diff --git a/R/plots.R b/R/plots.R index 108d4ed8..783fa2e8 100644 --- a/R/plots.R +++ b/R/plots.R @@ -1,23 +1,24 @@ -#' @title Plot Progress of Poisson NMF Optimization Over Time +#' @title Plot Progress of Model Fitting Over Time #' #' @description Create a plot showing improvement in one or more -#' Poisson NMF model fits over time. The horizontal axis shows the -#' recorded runtime (in s), and the vertical axis shows some quantity -#' measuring the quality of the fit: the log-likelihood, deviance or -#' maximum residual of the Karush-Kuhn-Tucker (KKT) first-order -#' optimality conditions. To better visualize log-likelihoods and -#' deviances, the log-likelihood and deviance differences are shown on -#' the logarithmic scale. where the differences are calculated with -#' respect to the best value achieved over all the fits compared. +#' Poisson NMF or multinomial topic model fits over time. #' -#' @details Note that only minimal argument checking is performed. +#' @details The horizontal axis shows the recorded runtime (in s), and +#' the vertical axis shows some quantity measuring the quality of the +#' fit: the log-likelihood, deviance or maximum residual of the +#' Karush-Kuhn-Tucker (KKT) first-order optimality conditions. To +#' better visualize log-likelihoods and deviances, log-likelihood and +#' deviance differences are shown on the logarithmic scale. +#' Differences are calculated with respect to the best value achieved +#' over all the fits compared. +#' +#' Note that only minimal argument checking is performed. #' #' @param fits An object of class \code{"poisson_nmf_fit"} or #' \code{"multinom_topic_model_fit"}, or a non-empty, named list in -#' which each list element is an object of class -#' \code{"poisson_nmf_fit"} or \code{"multinom_topic_model_fit"}. -#' Multinomial topic model fits are automatically converted to -#' equivalent Poisson NMF fits using \code{\link{multinom2poisson}}. +#' which each all list elements are objects of class +#' \code{"poisson_nmf_fit"} or all objects of class +#' \code{"multinom_topic_model_fit"}. #' #' @param x Choose \code{"timing"} to plot improvement in the solution #' over time, or choose \code{"iter"} to plot improvement in the @@ -25,8 +26,10 @@ #' #' @param y Column of the "progress" data frame used to assess #' progress of the Poisson NMF optimization method(s). Should be one -#' of \code{"loglik"} (log-likelihood), \code{"dev"} (deviance) or -#' \code{"res"} (maximum residual of KKT conditions). +#' of \code{"loglik"} (Poisson NMF or multinomial topic model +#' log-likelihood), \code{"dev"} (deviance) or \code{"res"} (maximum +#' residual of KKT conditions). The deviance is only valid for Poisson +#' NMF model fits. #' #' @param add.point.every A positive integer giving the iteration #' interval for drawing points on the progress curves. Set to @@ -81,7 +84,7 @@ #' #' @export #' -plot_progress_poisson_nmf <- +plot_progress <- function (fits, x = c("timing","iter"), y = c("loglik","dev","res"), add.point.every = 20, colors = c("#E69F00","#56B4E9","#009E73","#F0E442","#0072B2", @@ -99,13 +102,13 @@ plot_progress_poisson_nmf <- } else { msg <- paste("Input argument \"fits\" should either be an object of", "class \"poisson_nmf_fit\" or \"multinom_topic_model_fit\"", - "or a non-empty, named list in which each list element is", - "an object of class \"poisson_nmf_fit\" or", + "or a non-empty, named list in which all list elements are", + "objects of class \"poisson_nmf_fit\" or all of class", "\"multinom_topic_model_fit\"") if (!(is.list(fits) & !is.null(names(fits)) & length(fits) > 0)) stop(msg) - if (!all(sapply(fits,function (x) inherits(x,"poisson_nmf_fit") | - inherits(x,"multinom_topic_model_fit")))) + if (!(all(sapply(fits,function(x)inherits(x,"poisson_nmf_fit"))) | + all(sapply(fits,function(x)inherits(x,"multinom_topic_model_fit"))))) stop(msg) if (!all(nchar(names(fits)) > 0)) stop(msg) @@ -114,7 +117,10 @@ plot_progress_poisson_nmf <- # Check and process input arguments "x" and "y". x <- match.arg(x) y <- match.arg(y) - + if (y == "dev" & !inherits(fits[[1]],"poisson_nmf_fit")) + stop("y = \"dev\" is only valid for Poisson NMF model fits") + if (y == "loglik" & inherits(fits[[1]],"multinom_topic_model_fit")) + y <- "loglik.multinom" # Check and process input arguments "colors", "linetypes", # "linesizes" and "shapes". n <- length(fits) @@ -139,8 +145,8 @@ plot_progress_poisson_nmf <- linesizes,shapes,fills,theme)) } -# Used by plot_progress_poisson_nmf to create a data frame suitable -# for plotting with ggplot. +# Used by plot_progress to create a data frame suitable for plotting +# with ggplot. prepare_progress_plot_data <- function (fits, e) { n <- length(fits) labels <- names(fits) @@ -150,16 +156,18 @@ prepare_progress_plot_data <- function (fits, e) { y$timing <- cumsum(y$timing) fits[[i]] <- y } - out <- do.call(rbind,fits) - out$method <- factor(out$method,labels) - out$loglik <- max(out$loglik) - out$loglik + e - out$dev <- out$dev - min(out$dev) + e + out <- do.call(rbind,fits) + out$method <- factor(out$method,labels) + out$loglik <- max(out$loglik) - out$loglik + e + out$loglik.multinom <- max(out$loglik.multinom) - out$loglik.multinom + e + out$dev <- out$dev - min(out$dev) + e return(out) } -# Used by plot_progress_poisson_nmf to create the plot. +# Used by plot_progress to create the plot. create_progress_plot <- function (pdat, x, y, add.point.every, colors, - linetypes, linesizes, shapes, fills, theme) { + linetypes, linesizes, shapes, fills, + theme) { rows <- which(pdat$iter %% add.point.every == 1) if (x == "timing") xlab <- "runtime (s)" @@ -167,8 +175,10 @@ create_progress_plot <- function (pdat, x, y, add.point.every, colors, xlab <- "iteration" if (y == "res") ylab <- "max KKT residual" - else if (y == "loglik" | y == "dev") - ylab <- paste("distance from best",y) + else if (y == "dev") + ylab <- paste("distance from best deviance") + else if (y == "loglik" | y == "loglik.multinom") + ylab <- paste("distance from best loglik") return(ggplot(pdat,aes_string(x = x,y = y,color = "method", linetype = "method",size = "method")) + geom_line(na.rm = TRUE) + @@ -188,15 +198,15 @@ create_progress_plot <- function (pdat, x, y, add.point.every, colors, #' @rdname plot_loglik_vs_rank #' -#' @title Plot Log-Likelihood Versus Poisson NMF Rank +#' @title Plot Log-Likelihood Versus Rank #' -#' @description Create a plot showing the improvement in the Poisson -#' NMF log-likelihood as the rank of the matrix factorization or the +#' @description Create a plot showing the improvement in the +#' log-likelihood as the rank of the matrix factorization or the #' number of topics (\dQuote{k}) increases. #' #' @param fits A list with 2 more list elements, in which each list #' element is an object of class \code{"poisson_nmf_fit"} or -#' \code{"multinom_topic_model_fit"}. If two or more fits shares the +#' \code{"multinom_topic_model_fit"}. If two or more fits share the #' same rank, or number of topics, the largest log-likelihood is #' plotted. #' @@ -210,21 +220,17 @@ create_progress_plot <- function (pdat, x, y, add.point.every, colors, #' plot_loglik_vs_rank <- function (fits, ggplot_call = loglik_vs_rank_ggplot_call) { - msg <- paste("Input argument \"fits\" should be a list of length 2 or more ", - "in which each list element is an object of class", - "\"poisson_nmf_fit\" or \"multinom_topic_model_fit\"") + msg <- paste("Input argument \"fits\" should be a list of length 2 or more", + "in which all list elements are Poisson NMF fits or all", + "multinomial topic model fits") if (!(is.list(fits) & length(fits) > 1)) stop(msg) - if (!all(sapply(fits,function (x) - inherits(x,"poisson_nmf_fit") | - inherits(x,"multinom_topic_model_fit")))) + if (!(all(sapply(fits,function (x) inherits(x,"poisson_nmf_fit"))) | + all(sapply(fits,function (x) inherits(x,"multinom_topic_model_fit"))))) stop(msg) n <- length(fits) names(fits) <- paste0("fit",1:n) - for (i in 1:n) - if (inherits(fits[[i]],"multinom_topic_model_fit")) - fits[[i]] <- multinom2poisson(fits[[i]]) - dat <- compare_poisson_nmf_fits(fits)[c("k","loglik.diff")] + dat <- compare_fits(fits)[c("k","loglik.diff")] dat$k <- factor(dat$k) y <- tapply(dat$loglik.diff,dat$k,max) dat <- data.frame(x = as.numeric(names(y)),y = y) diff --git a/R/summary.R b/R/summary.R index 17553ed6..95bf5e73 100644 --- a/R/summary.R +++ b/R/summary.R @@ -14,7 +14,7 @@ #' @return The functions \code{summary.poisson_nmf_fit} and #' \code{summary.multinom_topic_model_fit} compute and return a list #' of statistics summarizing the model fit. The returned list -#' includes the following elements: +#' includes some or all of the following elements: #' #' \item{n}{The number of rows in the counts matrix, typically the #' number of samples.} @@ -31,16 +31,17 @@ #' \item{numiter}{The number of loadings and/or factor updates #' performed.} #' -#' \item{loglik}{The log-likelihood attained by the Poisson NMF model -#' fit.} +#' \item{loglik}{The Poisson NMF log-likelihood.} #' -#' \item{dev}{The deviance attained by the Poisson NMF model fit.} +#' \item{loglik.multinom}{The multinomial topic model log-likelihood.} +#' +#' \item{dev}{The Poisson NMF deviance.} #' #' \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) #' first-order optimality conditions. This can be used to assess #' convergence of the updates to a (local) solution.} #' -#' \item{mixprop}{Matrix giving a high-level summary of the +#' \item{mixprops}{Matrix giving a high-level summary of the #' mixture proportions, in which rows correspond to topics, and #' columns are ranges of mixture proportionss.} #' @@ -52,7 +53,7 @@ #' @export #' summary.poisson_nmf_fit <- function (object, ...) { - out <- summary.multinom_topic_model_fit(poisson2multinom(object)) + out <- summary.multinom_topic_model_fit(poisson2multinom(object),...) class(out) <- c("summary.poisson_nmf_fit","list") return(out) } @@ -74,9 +75,10 @@ summary.multinom_topic_model_fit <- function (object, ...) { s = object$s, numiter = numiter, loglik = object$progress[numiter,"loglik"], + loglik.multinom = object$progress[numiter,"loglik.multinom"], dev = object$progress[numiter,"dev"], res = object$progress[numiter,"res"], - mixprop = summarize_mixprops(object$L), + mixprops = summarize_mixprops(object$L), topic.reps = get_topic_reps(object$L)) class(out) <- c("summary.multinom_topic_model_fit","list") return(out) @@ -87,9 +89,6 @@ summary.multinom_topic_model_fit <- function (object, ...) { #' @param x An object of class \dQuote{summary.poisson_nmf_fit}, #' usually a result of a call to \code{summary.poisson_nmf_fit}. #' -#' @param show.size.factors If \code{TRUE}, print a summary of the -#' size factors. -#' #' @param show.mixprops If \code{TRUE}, print a summary of the mixture #' proportions. #' @@ -103,15 +102,18 @@ summary.multinom_topic_model_fit <- function (object, ...) { #' #' @export #' -print.summary.poisson_nmf_fit <- - function (x, show.size.factors = FALSE, show.mixprops = FALSE, - show.topic.reps = FALSE, ...) { - print.summary.multinom_topic_model_fit(x,...) +print.summary.poisson_nmf_fit <- function (x, show.mixprops = FALSE, + show.topic.reps = FALSE, ...) { + print.summary.multinom_topic_model_fit(x,FALSE,show.mixprops, + show.topic.reps,...) return(invisible(x)) } #' @rdname summary.poisson_nmf_fit #' +#' @param show.size.factors If \code{TRUE}, print a summary of the +#' size factors. +#' #' @method print summary.multinom_topic_model_fit #' #' @importFrom stats quantile @@ -126,12 +128,14 @@ print.summary.multinom_topic_model_fit <- cat(sprintf(" Number of data rows, n: %d\n",x$n)) cat(sprintf(" Number of data cols, m: %d\n",x$m)) cat(sprintf(" Rank/number of topics, k: %d\n",x$k)) - cat(sprintf("Evaluation of Poisson NMF fit (%d updates performed):\n", + cat(sprintf("Evaluation of model fit (%d updates performed):\n", x$numiter)) cat(sprintf(" Poisson NMF log-likelihood: %+0.12e\n",x$loglik)) + cat(sprintf(" Multinomial topic model log-likelihood: %0.12e\n", + x$loglik.multinom)) cat(sprintf(" Poisson NMF deviance: %+0.12e\n",x$dev)) cat(sprintf(" Max KKT residual: %+0.6e\n",x$res)) - if (!show.size.factors & !show.mixprops & !show.topic.reps) + if (!(show.size.factors | show.mixprops | show.topic.reps)) message("Set show.size.factors = TRUE, show.mixprops = TRUE and/or ", "show.topic.reps = TRUE in print(...) to show more informataion") if (show.size.factors & !is.null(x$s)) { @@ -151,24 +155,27 @@ print.summary.multinom_topic_model_fit <- return(invisible(x)) } -#' @title Summarize and Compare Poisson NMF Model Fits +#' @title Summarize and Compare Model Fits #' #' @description Create a table summarizing the results of fitting one -#' or more Poisson non-negative matrix factorizations. +#' or more Poisson non-negative matrix factorizations or multinomial +#' topic models. #' -#' @param fits An object of class \code{"poisson_nmf_fit"}, or a -#' non-empty, named list in which each list element is an object of -#' class \code{"poisson_nmf_fit"}. +#' @param fits An object of class \code{"poisson_nmf_fit"} or +#' \code{"multinom_topic_model_fit"}, or a non-empty, named list in +#' which all list elements are Poisson NMF model fits or all +#' multinomial topic model fits. #' #' @return A data frame with one row per element of \code{fits}, and #' with the following columns: #' #' \item{k}{The rank of the matrix factorization.} #' -#' \item{loglik}{The log-likelihood achieved at the last model fitting -#' update.} +#' \item{loglik}{The log-likelihood (either Poisson NMF or multinomial topic +#' model likelihood) achieved at the last model fitting update.} #' -#' \item{dev}{The deviance achieved at the last model fitting update.} +#' \item{dev}{For Poisson NMF model fits only, the deviance achieved +#' at the last model fitting update.} #' #' \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) #' system achieved at the last model fitting update; small values @@ -198,10 +205,11 @@ print.summary.multinom_topic_model_fit <- #' #' @export #' -compare_poisson_nmf_fits <- function (fits) { +compare_fits <- function (fits) { # Check and process input "fits". It should either be an object of - # class poisson_nmf_fit, or a list of poisson_nmf_fit objects. + # class poisson_nmf_fit or multinom_topic_model_fit, or a list of + # fit objects. if (inherits(fits,"poisson_nmf_fit")) { verify.fit(fits) fit.name <- deparse(substitute(fits)) @@ -210,11 +218,13 @@ compare_poisson_nmf_fits <- function (fits) { } else { msg <- paste("Input argument \"fits\" should either be an object of", "class \"poisson_nmf_fit\", or a non-empty, named list in", - "which each list element is an object of class", - "\"poisson_nmf_fit\"") + "which all list elements are objects of class", + "\"poisson_nmf_fit\" or all objects of class", + "\"multinom_topic_model_fit\"") if (!(is.list(fits) & !is.null(names(fits)) & length(fits) > 0)) stop(msg) - if (!all(sapply(fits,function (x) inherits(x,"poisson_nmf_fit")))) + if (!(all(sapply(fits,function(x)inherits(x,"poisson_nmf_fit"))) | + all(sapply(fits,function(x)inherits(x,"multinom_topic_model_fit"))))) stop(msg) if (!all(nchar(names(fits)) > 0)) stop(msg) @@ -241,8 +251,9 @@ compare_poisson_nmf_fits <- function (fits) { fit <- fits[[i]] x <- tail(fit$progress,n = 1) out[i,"k"] <- ncol(fit$F) - out[i,"loglik"] <- x$loglik - out[i,"dev"] <- x$dev + out[i,"loglik"] <- ifelse(inherits(fit,"poisson_nmf_fit"),x$loglik, + x$loglik.multinom) + out[i,"dev"] <- ifelse(inherits(fit,"poisson_nmf_fit"),x$dev,NA) out[i,"res"] <- x$res out[i,"nonzeros.f"] <- x$nonzeros.f out[i,"nonzeros.l"] <- x$nonzeros.l diff --git a/R/verify_args.R b/R/verify_args.R index 650452da..12a4b674 100644 --- a/R/verify_args.R +++ b/R/verify_args.R @@ -33,8 +33,8 @@ verify.count.matrix <- function (x, arg.name = deparse(substitute(x))) { return(TRUE) } -# Verify that x is a valid multinomial topic model fit or non-negative -# matrix factorization. +# Verify that x is a valid multinomial topic model fit or Poisson +# non-negative matrix factorization. verify.fit <- function (x, arg.name = deparse(substitute(x))) { arg.name.F <- paste0(arg.name,"$F") arg.name.L <- paste0(arg.name,"$L") diff --git a/TODO.txt b/TODO.txt index 9a676272..7eda0a46 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,6 +1,8 @@ to do ===== ++ Update pkgdown site. + + Remove files in inst/derivations/diffcount. + Implement "t" S3 method to transpose the rows and columns of a diff --git a/man/compare_poisson_nmf_fits.Rd b/man/compare_fits.Rd similarity index 66% rename from man/compare_poisson_nmf_fits.Rd rename to man/compare_fits.Rd index d53d6af0..a81b5705 100644 --- a/man/compare_poisson_nmf_fits.Rd +++ b/man/compare_fits.Rd @@ -1,15 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/summary.R -\name{compare_poisson_nmf_fits} -\alias{compare_poisson_nmf_fits} -\title{Summarize and Compare Poisson NMF Model Fits} +\name{compare_fits} +\alias{compare_fits} +\title{Summarize and Compare Model Fits} \usage{ -compare_poisson_nmf_fits(fits) +compare_fits(fits) } \arguments{ -\item{fits}{An object of class \code{"poisson_nmf_fit"}, or a -non-empty, named list in which each list element is an object of -class \code{"poisson_nmf_fit"}.} +\item{fits}{An object of class \code{"poisson_nmf_fit"} or +\code{"multinom_topic_model_fit"}, or a non-empty, named list in +which all list elements are Poisson NMF model fits or all +multinomial topic model fits.} } \value{ A data frame with one row per element of \code{fits}, and @@ -17,10 +18,11 @@ with the following columns: \item{k}{The rank of the matrix factorization.} -\item{loglik}{The log-likelihood achieved at the last model fitting - update.} +\item{loglik}{The log-likelihood (either Poisson NMF or multinomial topic + model likelihood) achieved at the last model fitting update.} -\item{dev}{The deviance achieved at the last model fitting update.} +\item{dev}{For Poisson NMF model fits only, the deviance achieved + at the last model fitting update.} \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) system achieved at the last model fitting update; small values @@ -47,7 +49,8 @@ with the following columns: } \description{ Create a table summarizing the results of fitting one - or more Poisson non-negative matrix factorizations. + or more Poisson non-negative matrix factorizations or multinomial + topic models. } \seealso{ \code{\link{fit_poisson_nmf}}, diff --git a/man/fit_poisson_nmf.Rd b/man/fit_poisson_nmf.Rd index c03616b6..bac7df9d 100644 --- a/man/fit_poisson_nmf.Rd +++ b/man/fit_poisson_nmf.Rd @@ -168,21 +168,21 @@ with the following elements: \item{progress}{A data frame containing detailed information about the algorithm's progress. The data frame should have \code{numiter} rows. The columns of the data frame are: "iter", the iteration - number; "loglik", the (Poisson) log-likelihood at the current best - factor and loading estimates; "dev", the deviance at the current - best factor and loading estimates; "res", the maximum residual of - the Karush-Kuhn-Tucker (KKT) first-order optimality conditions at - the current best factor and loading estimates; "loglik.multinom", - the multinomial log-likelihood at the current best factor and - loading estimates; "delta.f", the largest change in the factors - matrix; "delta.l", the largest change in the loadings matrix; - "nonzeros.f", the proportion of entries in the factors matrix that - are nonzero; "nonzeros.l", the proportion of entries in the - loadings matrix that are nonzero; "extrapolate", which is 1 if - extrapolation is used, otherwise it is 0; "beta", the setting of - the extrapolation parameter; "betamax", the setting of the - extrapolation parameter upper bound; and "timing", the elapsed time - in seconds (recorded using \code{\link{proc.time}}).} + number; "loglik", the Poisson NMF log-likelihood at the current + best factor and loading estimates; "dev", the deviance at the + current best factor and loading estimates; "res", the maximum + residual of the Karush-Kuhn-Tucker (KKT) first-order optimality + conditions at the current best factor and loading estimates; + "loglik.multinom", the multinomial topic model log-likelihood at + the current best factor and loading estimates; "delta.f", the + largest change in the factors matrix; "delta.l", the largest change + in the loadings matrix; "nonzeros.f", the proportion of entries in + the factors matrix that are nonzero; "nonzeros.l", the proportion + of entries in the loadings matrix that are nonzero; "extrapolate", + which is 1 if extrapolation is used, otherwise it is 0; "beta", the + setting of the extrapolation parameter; "betamax", the setting of + the extrapolation parameter upper bound; and "timing", the elapsed + time in seconds (recorded using \code{\link{proc.time}}).} } \description{ Approximate the input matrix \code{X} by the @@ -335,9 +335,9 @@ fit_scd <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "scd", # Compare the two fits. fits <- list(em = fit_em,scd = fit_scd) -compare_poisson_nmf_fits(fits) -plot_progress_poisson_nmf(fits,y = "loglik") -plot_progress_poisson_nmf(fits,y = "res") +compare_fits(fits) +plot_progress(fits,y = "loglik") +plot_progress(fits,y = "res") # Recover the topic model. After this step, the L matrix contains the # mixture proportions ("loadings"), and the F matrix contains the diff --git a/man/likelihood.Rd b/man/likelihood.Rd index 89351575..cde80005 100644 --- a/man/likelihood.Rd +++ b/man/likelihood.Rd @@ -33,18 +33,18 @@ inaccuracy in the computation.} \item{B}{The k x m matrix of factors. It should be a dense matrix.} -\item{family}{If \code{model = "poisson"}, the loss function -corresponding to the Poisson non-negative matrix factorization is -computed; if \code{model = "multinom"}, multinomial topic model -loss function values are returned. See "Value" for details.} +\item{family}{If \code{model = "poisson"}, the loss function values +corresponding to the Poisson non-negative matrix factorization are +computed; if \code{model = "multinom"}, the multinomial topic model +loss function values are returned.} \item{version}{When \code{version == "R"}, the computations are - performed entirely in R; when \code{version == "Rcpp"}, an Rcpp +performed entirely in R; when \code{version == "Rcpp"}, an Rcpp implementation is used. The R version is typically faster when - \code{X} is a dense matrix, whereas the Rcpp version is faster and - more memory-efficient when \code{X} is a large, sparse matrix. When - not specified, the most suitable version is called depending on - whether \code{X} is dense or sparse.} +\code{X} is a dense matrix, whereas the Rcpp version is faster and +more memory-efficient when \code{X} is a large, sparse matrix. When +not specified, the most suitable version is called depending on +whether \code{X} is dense or sparse.} } \value{ A numeric vector with one entry per row of \code{X}. @@ -54,9 +54,10 @@ Compute log-likelihoods and deviances for assessing fit of a topic model or a non-negative matrix factorization (NMF). } \details{ -Function \code{cost} is mainly for internal use to quickly - compute log-likelihoods and deviances; it should not be used - directly unless you know what you are doing. In particular, very +Function \code{cost} computes loss functions proportional + to the negative log-likelihoods, and is mainly for internal use to + quickly compute log-likelihoods and deviances; it should not be + used directly unless you know what you are doing. In particular, little argument checking is performed by \code{cost}. } \examples{ diff --git a/man/plot_loglik_vs_rank.Rd b/man/plot_loglik_vs_rank.Rd index f12a2372..c9e8e92d 100644 --- a/man/plot_loglik_vs_rank.Rd +++ b/man/plot_loglik_vs_rank.Rd @@ -3,7 +3,7 @@ \name{plot_loglik_vs_rank} \alias{plot_loglik_vs_rank} \alias{loglik_vs_rank_ggplot_call} -\title{Plot Log-Likelihood Versus Poisson NMF Rank} +\title{Plot Log-Likelihood Versus Rank} \usage{ plot_loglik_vs_rank(fits, ggplot_call = loglik_vs_rank_ggplot_call) @@ -12,7 +12,7 @@ loglik_vs_rank_ggplot_call(dat, font.size = 9) \arguments{ \item{fits}{A list with 2 more list elements, in which each list element is an object of class \code{"poisson_nmf_fit"} or -\code{"multinom_topic_model_fit"}. If two or more fits shares the +\code{"multinom_topic_model_fit"}. If two or more fits share the same rank, or number of topics, the largest log-likelihood is plotted.} @@ -30,7 +30,7 @@ customize the appearance of the plot.} A \code{ggplot} object. } \description{ -Create a plot showing the improvement in the Poisson - NMF log-likelihood as the rank of the matrix factorization or the +Create a plot showing the improvement in the + log-likelihood as the rank of the matrix factorization or the number of topics (\dQuote{k}) increases. } diff --git a/man/plot_progress_poisson_nmf.Rd b/man/plot_progress.Rd similarity index 71% rename from man/plot_progress_poisson_nmf.Rd rename to man/plot_progress.Rd index 7dad5cde..add58851 100644 --- a/man/plot_progress_poisson_nmf.Rd +++ b/man/plot_progress.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plots.R -\name{plot_progress_poisson_nmf} -\alias{plot_progress_poisson_nmf} -\title{Plot Progress of Poisson NMF Optimization Over Time} +\name{plot_progress} +\alias{plot_progress} +\title{Plot Progress of Model Fitting Over Time} \usage{ -plot_progress_poisson_nmf( +plot_progress( fits, x = c("timing", "iter"), y = c("loglik", "dev", "res"), @@ -22,10 +22,9 @@ plot_progress_poisson_nmf( \arguments{ \item{fits}{An object of class \code{"poisson_nmf_fit"} or \code{"multinom_topic_model_fit"}, or a non-empty, named list in -which each list element is an object of class -\code{"poisson_nmf_fit"} or \code{"multinom_topic_model_fit"}. -Multinomial topic model fits are automatically converted to -equivalent Poisson NMF fits using \code{\link{multinom2poisson}}.} +which each all list elements are objects of class +\code{"poisson_nmf_fit"} or all objects of class +\code{"multinom_topic_model_fit"}.} \item{x}{Choose \code{"timing"} to plot improvement in the solution over time, or choose \code{"iter"} to plot improvement in the @@ -33,8 +32,10 @@ solution per iteration.} \item{y}{Column of the "progress" data frame used to assess progress of the Poisson NMF optimization method(s). Should be one -of \code{"loglik"} (log-likelihood), \code{"dev"} (deviance) or -\code{"res"} (maximum residual of KKT conditions).} +of \code{"loglik"} (Poisson NMF or multinomial topic model +log-likelihood), \code{"dev"} (deviance) or \code{"res"} (maximum +residual of KKT conditions). The deviance is only valid for Poisson +NMF model fits.} \item{add.point.every}{A positive integer giving the iteration interval for drawing points on the progress curves. Set to @@ -75,16 +76,18 @@ A \code{ggplot} object. } \description{ Create a plot showing improvement in one or more - Poisson NMF model fits over time. The horizontal axis shows the - recorded runtime (in s), and the vertical axis shows some quantity - measuring the quality of the fit: the log-likelihood, deviance or - maximum residual of the Karush-Kuhn-Tucker (KKT) first-order - optimality conditions. To better visualize log-likelihoods and - deviances, the log-likelihood and deviance differences are shown on - the logarithmic scale. where the differences are calculated with - respect to the best value achieved over all the fits compared. + Poisson NMF or multinomial topic model fits over time. } \details{ +The horizontal axis shows the recorded runtime (in s), and +the vertical axis shows some quantity measuring the quality of the +fit: the log-likelihood, deviance or maximum residual of the +Karush-Kuhn-Tucker (KKT) first-order optimality conditions. To +better visualize log-likelihoods and deviances, log-likelihood and +deviance differences are shown on the logarithmic scale. +Differences are calculated with respect to the best value achieved +over all the fits compared. + Note that only minimal argument checking is performed. } \seealso{ diff --git a/man/summary.poisson_nmf_fit.Rd b/man/summary.poisson_nmf_fit.Rd index e8cd04ae..c0363dfa 100644 --- a/man/summary.poisson_nmf_fit.Rd +++ b/man/summary.poisson_nmf_fit.Rd @@ -11,13 +11,7 @@ \method{summary}{multinom_topic_model_fit}(object, ...) -\method{print}{summary.poisson_nmf_fit}( - x, - show.size.factors = FALSE, - show.mixprops = FALSE, - show.topic.reps = FALSE, - ... -) +\method{print}{summary.poisson_nmf_fit}(x, show.mixprops = FALSE, show.topic.reps = FALSE, ...) \method{print}{summary.multinom_topic_model_fit}( x, @@ -40,20 +34,20 @@ or \code{print.summary} method.} \item{x}{An object of class \dQuote{summary.poisson_nmf_fit}, usually a result of a call to \code{summary.poisson_nmf_fit}.} -\item{show.size.factors}{If \code{TRUE}, print a summary of the -size factors.} - \item{show.mixprops}{If \code{TRUE}, print a summary of the mixture proportions.} \item{show.topic.reps}{If \code{TRUE}, print a summary of the topic representatives.} + +\item{show.size.factors}{If \code{TRUE}, print a summary of the +size factors.} } \value{ The functions \code{summary.poisson_nmf_fit} and \code{summary.multinom_topic_model_fit} compute and return a list of statistics summarizing the model fit. The returned list -includes the following elements: +includes some or all of the following elements: \item{n}{The number of rows in the counts matrix, typically the number of samples.} @@ -70,16 +64,17 @@ includes the following elements: \item{numiter}{The number of loadings and/or factor updates performed.} -\item{loglik}{The log-likelihood attained by the Poisson NMF model - fit.} +\item{loglik}{The Poisson NMF log-likelihood.} + +\item{loglik.multinom}{The multinomial topic model log-likelihood.} -\item{dev}{The deviance attained by the Poisson NMF model fit.} +\item{dev}{The Poisson NMF deviance.} \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT) first-order optimality conditions. This can be used to assess convergence of the updates to a (local) solution.} -\item{mixprop}{Matrix giving a high-level summary of the +\item{mixprops}{Matrix giving a high-level summary of the mixture proportions, in which rows correspond to topics, and columns are ranges of mixture proportionss.} diff --git a/tests/testthat/test_fit_poisson_nmf.R b/tests/testthat/test_fit_poisson_nmf.R index 88516b5f..cd65548d 100644 --- a/tests/testthat/test_fit_poisson_nmf.R +++ b/tests/testthat/test_fit_poisson_nmf.R @@ -275,7 +275,7 @@ test_that(paste("When initialized \"close enough\" to a stationary point, the", # Check that both algorithms recover the same solution. fits <- list(fit1 = fit1,fit2 = fit2) - dat <- compare_poisson_nmf_fits(fits) + dat <- compare_fits(fits) expect_equal(dat["fit1","dev"],dat["fit2","dev"],tolerance = 0.01,scale = 1) expect_equal(dat["fit1","loglik"],dat["fit2","loglik"], tolerance = 0.01,scale = 1) @@ -338,7 +338,7 @@ test_that(paste("All Poisson NMF updates recover same solution when", # All three solution estimates should have nearly the same likelihood # and deviance. fits <- list(fit1 = fit1,fit2 = fit2) - dat <- compare_poisson_nmf_fits(fits) + dat <- compare_fits(fits) expect_equal(dat["fit1","loglik"],dat["fit2","loglik"], tolerance = 0.01,scale = 1) expect_equal(dat["fit1","dev"],dat["fit2","dev"],tolerance = 0.01,scale = 1) @@ -396,7 +396,7 @@ test_that(paste("Extrapolated updates achieve solutions that are as good", # higher (or at least not lower) than the non-extrapolated updates. fits <- list(fit1 = fit1,fit2 = fit2,fit3 = fit3,fit4 = fit4, fit5 = fit5,fit6 = fit6,fit7 = fit7,fit8 = fit8) - dat <- compare_poisson_nmf_fits(fits) + dat <- compare_fits(fits) expect_lte(dat["fit1","loglik"],dat["fit5","loglik"]) expect_lte(dat["fit2","loglik"],dat["fit6","loglik"]) expect_lte(dat["fit3","loglik"],dat["fit7","loglik"]) diff --git a/tests/testthat/test_plots.R b/tests/testthat/test_plots.R index ee87a2d5..6867f59a 100644 --- a/tests/testthat/test_plots.R +++ b/tests/testthat/test_plots.R @@ -12,8 +12,11 @@ test_that("Test that plot_loglik_vs_rank works",{ capture.output(fit10 <- fit_poisson_nmf(X,k = 10,numiter = 100)) # Plot log-likelihood vs. rank. - p <- plot_loglik_vs_rank(list(fit2,fit3,fit5,fit10)) - expect_s3_class(p,"ggplot") + p1 <- plot_loglik_vs_rank(list(fit2,fit3,fit5,fit10)) + p2 <- plot_loglik_vs_rank(lapply(list(fit2,fit3,fit5,fit10), + poisson2multinom)) + expect_s3_class(p1,"ggplot") + expect_s3_class(p2,"ggplot") }) test_that("Test that pca_plot and pca_hexbin_plot work",{ @@ -57,8 +60,11 @@ test_that("Test that other plotting functions work",{ fit2 <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "em", control = list(extrapolate = TRUE))) - # Test plot_progress_poisson_nmf. - plot_progress_poisson_nmf(list(scd = fit1,em = fit2)) + # Test plot_progress. + plot_progress(list(scd = fit1,em = fit2),y = "loglik") + plot_progress(list(scd = fit1,em = fit2),y = "dev") + plot_progress(list(scd = fit1,em = fit2),y = "res") + plot_progress(list(scd = poisson2multinom(fit1),em = poisson2multinom(fit2))) # Test loadings_plot. x <- factor(sample(1:4,80,replace = TRUE)) diff --git a/tests/testthat/test_summary.R b/tests/testthat/test_summary.R index 1eee66b7..8b171b20 100644 --- a/tests/testthat/test_summary.R +++ b/tests/testthat/test_summary.R @@ -10,6 +10,13 @@ test_that("summary method and print.summary methods produce output",{ # Fit a Poisson non-negative factorization. capture.output(fit <- fit_poisson_nmf(X,k = 3,numiter = 100)) - # Produce a summary of the model fit. + # Produce summaries of the model fit. expect_output(print(summary(fit))) + expect_output(print(summary(poisson2multinom(fit)))) + expect_output(print(summary(fit),show.mixprops = TRUE, + show.topic.reps = TRUE)) + expect_output(print(summary(poisson2multinom(fit)), + show.size.factors = TRUE, + show.mixprops = TRUE, + show.topic.reps = TRUE)) }) diff --git a/vignettes/single_cell_rnaseq_practical.Rmd b/vignettes/single_cell_rnaseq_practical.Rmd index 83113703..847dfe3c 100644 --- a/vignettes/single_cell_rnaseq_practical.Rmd +++ b/vignettes/single_cell_rnaseq_practical.Rmd @@ -58,8 +58,7 @@ fitting has converged to a maximum-likelihood solution. This can be easily done in **fastTopics**: ```{r plot-loglik, fig.height=2, fig.width=4} -plot_progress_poisson_nmf(fit,x = "iter",add.point.every = 10, - colors = "black") + +plot_progress(fit,x = "iter",add.point.every = 10,colors = "black") + theme_cowplot(font_size = 10) ```