Skip to content

Commit

Permalink
Revised compare_fits and plot_progress interface so that it know allo…
Browse files Browse the repository at this point in the history
…ws for either Poisson NMF or multinomial topic model fits as input.
  • Loading branch information
pcarbo committed Mar 4, 2021
1 parent 41adac1 commit 6293480
Show file tree
Hide file tree
Showing 18 changed files with 238 additions and 204 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Encoding: UTF-8
Type: Package
Package: fastTopics
Version: 0.5-22
Version: 0.5-23
Date: 2021-03-04
Title: Fast Algorithms for Fitting Topic Models and Non-Negative
Matrix Factorizations to Count Data
Expand Down
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ S3method(print,summary.poisson_nmf_fit)
S3method(select,multinom_topic_model_fit)
S3method(summary,multinom_topic_model_fit)
S3method(summary,poisson_nmf_fit)
export(compare_poisson_nmf_fits)
export(compare_fits)
export(cost)
export(deviance_poisson_nmf)
export(diff_count_analysis)
Expand All @@ -30,7 +30,7 @@ export(pca_hexbin_plot_ggplot_call)
export(pca_plot)
export(pca_plot_ggplot_call)
export(plot_loglik_vs_rank)
export(plot_progress_poisson_nmf)
export(plot_progress)
export(poisson2multinom)
export(select_loadings)
export(simulate_count_data)
Expand Down
42 changes: 21 additions & 21 deletions R/fit_poisson_nmf.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@
#' (KKT) first-order conditions. As the iterates approach a stationary
#' point of the loss function, the change in the model parameters
#' should be small, and the residuals of the KKT system should vanish.
#' Use \code{\link{plot_progress_poisson_nmf}} to plot the improvement
#' in the solution over time.
#' Use \code{\link{plot_progress}} to plot the improvement in the
#' solution over time.
#'
#' See \code{\link{fit_topic_model}} for additional guidance on model
#' fitting, particularly for large or complex data sets.
Expand Down Expand Up @@ -238,21 +238,21 @@
#' \item{progress}{A data frame containing detailed information about
#' the algorithm's progress. The data frame should have \code{numiter}
#' rows. The columns of the data frame are: "iter", the iteration
#' number; "loglik", the (Poisson) log-likelihood at the current best
#' factor and loading estimates; "dev", the deviance at the current
#' best factor and loading estimates; "res", the maximum residual of
#' the Karush-Kuhn-Tucker (KKT) first-order optimality conditions at
#' the current best factor and loading estimates; "loglik.multinom",
#' the multinomial log-likelihood at the current best factor and
#' loading estimates; "delta.f", the largest change in the factors
#' matrix; "delta.l", the largest change in the loadings matrix;
#' "nonzeros.f", the proportion of entries in the factors matrix that
#' are nonzero; "nonzeros.l", the proportion of entries in the
#' loadings matrix that are nonzero; "extrapolate", which is 1 if
#' extrapolation is used, otherwise it is 0; "beta", the setting of
#' the extrapolation parameter; "betamax", the setting of the
#' extrapolation parameter upper bound; and "timing", the elapsed time
#' in seconds (recorded using \code{\link{proc.time}}).}
#' number; "loglik", the Poisson NMF log-likelihood at the current
#' best factor and loading estimates; "dev", the deviance at the
#' current best factor and loading estimates; "res", the maximum
#' residual of the Karush-Kuhn-Tucker (KKT) first-order optimality
#' conditions at the current best factor and loading estimates;
#' "loglik.multinom", the multinomial topic model log-likelihood at
#' the current best factor and loading estimates; "delta.f", the
#' largest change in the factors matrix; "delta.l", the largest change
#' in the loadings matrix; "nonzeros.f", the proportion of entries in
#' the factors matrix that are nonzero; "nonzeros.l", the proportion
#' of entries in the loadings matrix that are nonzero; "extrapolate",
#' which is 1 if extrapolation is used, otherwise it is 0; "beta", the
#' setting of the extrapolation parameter; "betamax", the setting of
#' the extrapolation parameter upper bound; and "timing", the elapsed
#' time in seconds (recorded using \code{\link{proc.time}}).}
#'
#' @references
#'
Expand Down Expand Up @@ -309,9 +309,9 @@
#'
#' # Compare the two fits.
#' fits <- list(em = fit_em,scd = fit_scd)
#' compare_poisson_nmf_fits(fits)
#' plot_progress_poisson_nmf(fits,y = "loglik")
#' plot_progress_poisson_nmf(fits,y = "res")
#' compare_fits(fits)
#' plot_progress(fits,y = "loglik")
#' plot_progress(fits,y = "res")
#'
#' # Recover the topic model. After this step, the L matrix contains the
#' # mixture proportions ("loadings"), and the F matrix contains the
Expand Down Expand Up @@ -422,7 +422,7 @@ fit_poisson_nmf <- function (X, k, fit0, numiter = 100,
method.text <- "CCD"
cat(sprintf("Running %d %s updates, %s extrapolation ",numiter,
method.text,ifelse(control$extrapolate,"with","without")))
cat("(fastTopics 0.5-22).\n")
cat("(fastTopics 0.5-23).\n")
}

# INITIALIZE ESTIMATES
Expand Down
27 changes: 14 additions & 13 deletions R/likelihood.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
#' @description Compute log-likelihoods and deviances for assessing
#' fit of a topic model or a non-negative matrix factorization (NMF).
#'
#' @details Function \code{cost} is mainly for internal use to quickly
#' compute log-likelihoods and deviances; it should not be used
#' directly unless you know what you are doing. In particular, very
#' @details Function \code{cost} computes loss functions proportional
#' to the negative log-likelihoods, and is mainly for internal use to
#' quickly compute log-likelihoods and deviances; it should not be
#' used directly unless you know what you are doing. In particular,
#' little argument checking is performed by \code{cost}.
#'
#' @param X The n x m matrix of counts or pseudocounts. It can be a
Expand All @@ -16,7 +17,7 @@
#'
#' @param fit A Poisson NMF or multinomial topic model fit, such as an
#' output from \code{\link{fit_poisson_nmf}} or
#' \code{\link{fit_topic_model}}.
#' \code{\link{fit_topic_model}}.
#'
#' @param A The n x k matrix of loadings. It should be a dense matrix.
#'
Expand All @@ -27,14 +28,14 @@
#' numerical problems at the cost of introducing a very small
#' inaccuracy in the computation.
#'
#' @param family If \code{model = "poisson"}, the loss function
#' corresponding to the Poisson non-negative matrix factorization is
#' computed; if \code{model = "multinom"}, multinomial topic model
#' loss function values are returned. See "Value" for details.
#' @param family If \code{model = "poisson"}, the loss function values
#' corresponding to the Poisson non-negative matrix factorization are
#' computed; if \code{model = "multinom"}, the multinomial topic model
#' loss function values are returned.
#'
#' @param version When \code{version == "R"}, the computations are
#' performed entirely in R; when \code{version == "Rcpp"}, an Rcpp
#' implementation is used. The R version is typically faster when
#' implementation is used. The R version is typically faster when
#' \code{X} is a dense matrix, whereas the Rcpp version is faster and
#' more memory-efficient when \code{X} is a large, sparse matrix. When
#' not specified, the most suitable version is called depending on
Expand Down Expand Up @@ -194,10 +195,10 @@ poisson_nmf_kkt <- function (X, F, L, e = 1e-8) {
}

# Given a Poisson non-negative matrix factorization (F, L), compute
# the log-likelihoods for the "size factors"; that is, the
# log-likelihood for t ~ Poisson(s), where t = sum(x) is the total sum
# of the counts. The size factors, s, are recovered from the
# poisson2multinom transformation.
# the log-likelihoods for the "size factors"; that is, each vetor
# element is a log-likelihood for the model t ~ Poisson(s), where
# t = sum(x) is the total sum of the counts. The size factors, s, are
# recovered from the poisson2multinom transformation.
#
#' @importFrom stats dpois
loglik_size_factors <- function (X, F, L)
Expand Down
98 changes: 52 additions & 46 deletions R/plots.R
Original file line number Diff line number Diff line change
@@ -1,32 +1,35 @@
#' @title Plot Progress of Poisson NMF Optimization Over Time
#' @title Plot Progress of Model Fitting Over Time
#'
#' @description Create a plot showing improvement in one or more
#' Poisson NMF model fits over time. The horizontal axis shows the
#' recorded runtime (in s), and the vertical axis shows some quantity
#' measuring the quality of the fit: the log-likelihood, deviance or
#' maximum residual of the Karush-Kuhn-Tucker (KKT) first-order
#' optimality conditions. To better visualize log-likelihoods and
#' deviances, the log-likelihood and deviance differences are shown on
#' the logarithmic scale. where the differences are calculated with
#' respect to the best value achieved over all the fits compared.
#' Poisson NMF or multinomial topic model fits over time.
#'
#' @details Note that only minimal argument checking is performed.
#' @details The horizontal axis shows the recorded runtime (in s), and
#' the vertical axis shows some quantity measuring the quality of the
#' fit: the log-likelihood, deviance or maximum residual of the
#' Karush-Kuhn-Tucker (KKT) first-order optimality conditions. To
#' better visualize log-likelihoods and deviances, log-likelihood and
#' deviance differences are shown on the logarithmic scale.
#' Differences are calculated with respect to the best value achieved
#' over all the fits compared.
#'
#' Note that only minimal argument checking is performed.
#'
#' @param fits An object of class \code{"poisson_nmf_fit"} or
#' \code{"multinom_topic_model_fit"}, or a non-empty, named list in
#' which each list element is an object of class
#' \code{"poisson_nmf_fit"} or \code{"multinom_topic_model_fit"}.
#' Multinomial topic model fits are automatically converted to
#' equivalent Poisson NMF fits using \code{\link{multinom2poisson}}.
#' which each all list elements are objects of class
#' \code{"poisson_nmf_fit"} or all objects of class
#' \code{"multinom_topic_model_fit"}.
#'
#' @param x Choose \code{"timing"} to plot improvement in the solution
#' over time, or choose \code{"iter"} to plot improvement in the
#' solution per iteration.
#'
#' @param y Column of the "progress" data frame used to assess
#' progress of the Poisson NMF optimization method(s). Should be one
#' of \code{"loglik"} (log-likelihood), \code{"dev"} (deviance) or
#' \code{"res"} (maximum residual of KKT conditions).
#' of \code{"loglik"} (Poisson NMF or multinomial topic model
#' log-likelihood), \code{"dev"} (deviance) or \code{"res"} (maximum
#' residual of KKT conditions). The deviance is only valid for Poisson
#' NMF model fits.
#'
#' @param add.point.every A positive integer giving the iteration
#' interval for drawing points on the progress curves. Set to
Expand Down Expand Up @@ -81,7 +84,7 @@
#'
#' @export
#'
plot_progress_poisson_nmf <-
plot_progress <-
function (fits, x = c("timing","iter"), y = c("loglik","dev","res"),
add.point.every = 20,
colors = c("#E69F00","#56B4E9","#009E73","#F0E442","#0072B2",
Expand All @@ -99,13 +102,13 @@ plot_progress_poisson_nmf <-
} else {
msg <- paste("Input argument \"fits\" should either be an object of",
"class \"poisson_nmf_fit\" or \"multinom_topic_model_fit\"",
"or a non-empty, named list in which each list element is",
"an object of class \"poisson_nmf_fit\" or",
"or a non-empty, named list in which all list elements are",
"objects of class \"poisson_nmf_fit\" or all of class",
"\"multinom_topic_model_fit\"")
if (!(is.list(fits) & !is.null(names(fits)) & length(fits) > 0))
stop(msg)
if (!all(sapply(fits,function (x) inherits(x,"poisson_nmf_fit") |
inherits(x,"multinom_topic_model_fit"))))
if (!(all(sapply(fits,function(x)inherits(x,"poisson_nmf_fit"))) |
all(sapply(fits,function(x)inherits(x,"multinom_topic_model_fit")))))
stop(msg)
if (!all(nchar(names(fits)) > 0))
stop(msg)
Expand All @@ -114,7 +117,10 @@ plot_progress_poisson_nmf <-
# Check and process input arguments "x" and "y".
x <- match.arg(x)
y <- match.arg(y)

if (y == "dev" & !inherits(fits[[1]],"poisson_nmf_fit"))
stop("y = \"dev\" is only valid for Poisson NMF model fits")
if (y == "loglik" & inherits(fits[[1]],"multinom_topic_model_fit"))
y <- "loglik.multinom"
# Check and process input arguments "colors", "linetypes",
# "linesizes" and "shapes".
n <- length(fits)
Expand All @@ -139,8 +145,8 @@ plot_progress_poisson_nmf <-
linesizes,shapes,fills,theme))
}

# Used by plot_progress_poisson_nmf to create a data frame suitable
# for plotting with ggplot.
# Used by plot_progress to create a data frame suitable for plotting
# with ggplot.
prepare_progress_plot_data <- function (fits, e) {
n <- length(fits)
labels <- names(fits)
Expand All @@ -150,25 +156,29 @@ prepare_progress_plot_data <- function (fits, e) {
y$timing <- cumsum(y$timing)
fits[[i]] <- y
}
out <- do.call(rbind,fits)
out$method <- factor(out$method,labels)
out$loglik <- max(out$loglik) - out$loglik + e
out$dev <- out$dev - min(out$dev) + e
out <- do.call(rbind,fits)
out$method <- factor(out$method,labels)
out$loglik <- max(out$loglik) - out$loglik + e
out$loglik.multinom <- max(out$loglik.multinom) - out$loglik.multinom + e
out$dev <- out$dev - min(out$dev) + e
return(out)
}

# Used by plot_progress_poisson_nmf to create the plot.
# Used by plot_progress to create the plot.
create_progress_plot <- function (pdat, x, y, add.point.every, colors,
linetypes, linesizes, shapes, fills, theme) {
linetypes, linesizes, shapes, fills,
theme) {
rows <- which(pdat$iter %% add.point.every == 1)
if (x == "timing")
xlab <- "runtime (s)"
else if (x == "iter")
xlab <- "iteration"
if (y == "res")
ylab <- "max KKT residual"
else if (y == "loglik" | y == "dev")
ylab <- paste("distance from best",y)
else if (y == "dev")
ylab <- paste("distance from best deviance")
else if (y == "loglik" | y == "loglik.multinom")
ylab <- paste("distance from best loglik")
return(ggplot(pdat,aes_string(x = x,y = y,color = "method",
linetype = "method",size = "method")) +
geom_line(na.rm = TRUE) +
Expand All @@ -188,15 +198,15 @@ create_progress_plot <- function (pdat, x, y, add.point.every, colors,

#' @rdname plot_loglik_vs_rank
#'
#' @title Plot Log-Likelihood Versus Poisson NMF Rank
#' @title Plot Log-Likelihood Versus Rank
#'
#' @description Create a plot showing the improvement in the Poisson
#' NMF log-likelihood as the rank of the matrix factorization or the
#' @description Create a plot showing the improvement in the
#' log-likelihood as the rank of the matrix factorization or the
#' number of topics (\dQuote{k}) increases.
#'
#' @param fits A list with 2 more list elements, in which each list
#' element is an object of class \code{"poisson_nmf_fit"} or
#' \code{"multinom_topic_model_fit"}. If two or more fits shares the
#' \code{"multinom_topic_model_fit"}. If two or more fits share the
#' same rank, or number of topics, the largest log-likelihood is
#' plotted.
#'
Expand All @@ -210,21 +220,17 @@ create_progress_plot <- function (pdat, x, y, add.point.every, colors,
#'
plot_loglik_vs_rank <- function (fits,
ggplot_call = loglik_vs_rank_ggplot_call) {
msg <- paste("Input argument \"fits\" should be a list of length 2 or more ",
"in which each list element is an object of class",
"\"poisson_nmf_fit\" or \"multinom_topic_model_fit\"")
msg <- paste("Input argument \"fits\" should be a list of length 2 or more",
"in which all list elements are Poisson NMF fits or all",
"multinomial topic model fits")
if (!(is.list(fits) & length(fits) > 1))
stop(msg)
if (!all(sapply(fits,function (x)
inherits(x,"poisson_nmf_fit") |
inherits(x,"multinom_topic_model_fit"))))
if (!(all(sapply(fits,function (x) inherits(x,"poisson_nmf_fit"))) |
all(sapply(fits,function (x) inherits(x,"multinom_topic_model_fit")))))
stop(msg)
n <- length(fits)
names(fits) <- paste0("fit",1:n)
for (i in 1:n)
if (inherits(fits[[i]],"multinom_topic_model_fit"))
fits[[i]] <- multinom2poisson(fits[[i]])
dat <- compare_poisson_nmf_fits(fits)[c("k","loglik.diff")]
dat <- compare_fits(fits)[c("k","loglik.diff")]
dat$k <- factor(dat$k)
y <- tapply(dat$loglik.diff,dat$k,max)
dat <- data.frame(x = as.numeric(names(y)),y = y)
Expand Down
Loading

0 comments on commit 6293480

Please sign in to comment.