Skip to content

Commit

Permalink
add singular value plots to visualize OHT
Browse files Browse the repository at this point in the history
  • Loading branch information
andrearaithel committed Jan 5, 2025
1 parent 3ff7461 commit 5255a94
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 75 deletions.
18 changes: 9 additions & 9 deletions R/find_encoding_dimensions.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,17 +183,17 @@ estimateBestQ <- function(fds, type="jaccard", useOHT=TRUE, implementation="PCA"
}
cat("Optimal encoding dimension:", latentDim, "\n")

data <- data.table(
q=latentDim,
oht=cutoff
)
data <- data.table(singular_values=sv)
data <- data[, q:=.I]
data <- data[-.N] # remove last entry (close to zero, impedes visualization)
data <- data[, oht:=FALSE]
data <- data[q==latentDim, oht:=TRUE] # set optimal latent dimension
data <- data[order(-oht)]

hyperParams(fds, type=type) <- data
#TODO: integrate singular value plot into plotEncDimSearch function
#if(isTRUE(plot)){
# print(plotEncDimSearch(fds, type=type, plotType="sv"))
#}

if(isTRUE(plot)){
print(plotEncDimSearch(fds, type=type, plotType="sv"))
}
return(fds)
}

Expand Down
10 changes: 8 additions & 2 deletions R/getNSetterFuns.R
Original file line number Diff line number Diff line change
Expand Up @@ -529,8 +529,14 @@ hyperParams <- function(fds, type=currentType(fds), all=FALSE){
if(is.null(ans)){
return(ans)
}
if(isFALSE(all) && "aroc" %in% names(ans)){
ans <- ans[aroc == max(aroc)][1]
if(isFALSE(all)){
if ("aroc" %in% names(ans)){
ans <- ans[aroc == max(aroc)][1]
}
else{
ans <- ans[oht == TRUE][1]
}

}
ans
}
Expand Down
126 changes: 77 additions & 49 deletions R/plotMethods.R
Original file line number Diff line number Diff line change
Expand Up @@ -1008,7 +1008,7 @@ setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)


plotEncDimSearch.FRASER <- function(object, type=psiTypes,
plotType=c("auc", "loss")){
plotType=c("auc", "loss", "sv")){
type <- match.arg(type)
plotType <- match.arg(plotType)
data <- hyperParams(object, type=type, all=TRUE)
Expand All @@ -1017,59 +1017,87 @@ plotEncDimSearch.FRASER <- function(object, type=psiTypes,
"\nPlease use `estimateBestQ` to compute them."))
return(NULL)
}
else if ("oht" %in% names(data)) {
warning(paste("OHT was used to estimate the optimal encoding dimension.",
"A visualization of the method will be included soon.",
collapse="\n"))
return(NULL)
}
if(!"nsubset" %in% colnames(data)){
data[,nsubset:="NA"]
}
data[,noise:=as.factor(noise)]
data[,nsubset:=as.factor(nsubset)]

data[,isOptimalQ:= q == .SD[aroc == max(aroc), q], by="nsubset,noise"]
if (plotType == "sv") {
if (type!="jaccard"){
warning(paste("A singular value plot can only be generated for the",
"Intron Jaccard Indices.",
collapse="\n"))
return(NULL)}
else if (!("oht" %in% names(data))) {
warning(paste("Please use `estimateBestQ` to compute the singular values",
"using OHT before visualizing them.",
collapse="\n"))
return(NULL)}

opt_data <- data[oht==TRUE,]

g0 <- ggplot(data, aes(q, singular_values)) +
geom_line(color="blue") +
geom_vline(xintercept = opt_data$q, linetype = "dashed", color = "darkgray") +
geom_text(data=opt_data, aes(x = q+3, y = singular_values,
label = q), color = "darkgray", size=4.5) +
xlab("Singular value rank (Estimated q)") +
ylab("Log(singular value)") +
theme_bw(base_size=16) +
scale_y_log10() +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]]))))))
g0
}

if(plotType == "auc"){

g1 <- ggplot(data, aes(q, aroc, col=nsubset, linetype=noise)) +
geom_point() +
geom_smooth(method="loess", formula=y~x) +
geom_vline(data=data[isOptimalQ == TRUE,],
mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
geom_text(data=data[isOptimalQ == TRUE,],
aes(y=0.0, q+1, label=q)) +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
xlab("Estimated q") +
ylab("Area under the PR curve") +
theme_bw(base_size=16)

if(data[,uniqueN(nsubset)] == 1 & data[,uniqueN(noise)] == 1){
g1 <- g1 + theme(legend.position='none')
}
else if(plotType == "auc"){
if(!"nsubset" %in% colnames(data)){
data[,nsubset:="NA"]
}
data[,noise:=as.factor(noise)]
data[,nsubset:=as.factor(nsubset)]
data[,isOptimalQ:= q == .SD[aroc == max(aroc), q], by="nsubset,noise"]

g1
g1 <- ggplot(data, aes(q, aroc, col=nsubset, linetype=noise)) +
geom_point() +
geom_smooth(method="loess", formula=y~x) +
geom_vline(data=data[isOptimalQ == TRUE,],
mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
geom_text(data=data[isOptimalQ == TRUE,],
aes(y=0.0, q+1, label=q)) +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
xlab("Estimated q") +
ylab("Area under the PR curve") +
theme_bw(base_size=16)

if(data[,uniqueN(nsubset)] == 1 & data[,uniqueN(noise)] == 1){
g1 <- g1 + theme(legend.position='none')
}

g1
}

else{

g2 <- ggplot(data, aes(q, eval, col=nsubset, linetype=noise)) +
geom_point() +
geom_smooth() +
geom_vline(data=data[isOptimalQ == TRUE,],
mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
xlab("Estimated q") +
ylab("Model loss") +
theme_bw(base_size=16)

if(data[,uniqueN(nsubset)] == 1 & data[,uniqueN(noise)] == 1){
g2 <- g2 + theme(legend.position='none')
}

g2
if(!"nsubset" %in% colnames(data)){
data[,nsubset:="NA"]
}
data[,noise:=as.factor(noise)]
data[,nsubset:=as.factor(nsubset)]
data[,isOptimalQ:= q == .SD[aroc == max(aroc), q], by="nsubset,noise"]

g2 <- ggplot(data, aes(q, eval, col=nsubset, linetype=noise)) +
geom_point() +
geom_smooth() +
geom_vline(data=data[isOptimalQ == TRUE,],
mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
ggtitle(as.expression(bquote(bold(paste(
"Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
xlab("Estimated q") +
ylab("Model loss") +
theme_bw(base_size=16)

if(data[,uniqueN(nsubset)] == 1 & data[,uniqueN(noise)] == 1){
g2 <- g2 + theme(legend.position='none')
}

g2
}

}
Expand Down
2 changes: 1 addition & 1 deletion man/plotFunctions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 2 additions & 5 deletions tests/testthat/test_hyperParams.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,10 @@ test_that("Test Optimal Hard Thresholding", {
fds <- calculatePSIValues(fds)
fds <- estimateBestQ(fds, type="jaccard", useOHT=TRUE)

expect_equal(c(1,2), dim(hyperParams(fds, type="jaccard")))
expect_equal(c(1,3), dim(hyperParams(fds, type="jaccard")))
expect_equal(metadata(fds)$hyperParams_jaccard[1,q],
bestQ(fds, type="jaccard"))
expect_warning(plotEncDimSearch(fds, "jaccard"),
paste("OHT was used to estimate the optimal encoding dimension.",
"A visualization of the method will be included soon.",
collapse="\n"))
expect_is(suppressWarnings(plotEncDimSearch(fds, "jaccard", plotType="sv")), "ggplot")
})

test_that("Test optimalSVHTCoef", {
Expand Down
16 changes: 7 additions & 9 deletions vignettes/FRASER.Rnw
Original file line number Diff line number Diff line change
Expand Up @@ -693,26 +693,24 @@ and Donoho\cite{Gavish2013}. Alternatively, a hyperparameter optimization can be
performed that works by artificially injecting outliers into the data and then
comparing the AUC of recalling these outliers for different values of $q$. Since
this hyperparameter optimization step is quite CPU-intensive, we only show it here
for a subset of the dataset and recommend to use the faster OHT approach:
for a subset of the dataset and recommend to use the faster OHT approach. The results
from the estimation of the optimal latent space dimension can be visualized with the
function \Rfunction{plotEncDimSearch}.

<<findBestQ, echo=TRUE>>=
set.seed(42)
# Optimal Hard Thresholding (default method)
fds <- estimateBestQ(fds, type="jaccard", plot=FALSE)
# hyperparameter opimization
plotEncDimSearch(fds, type="jaccard", plotType="sv")
# Hyperparameter opimization
fds <- estimateBestQ(fds, type="jaccard", useOHT=FALSE, plot=FALSE)
plotEncDimSearch(fds, type="jaccard", plotType="auc")
# retrieve the estimated optimal dimension of the latent space
bestQ(fds, type="jaccard")
@

The results from this hyper parameter optimization can be visualized with the
function \Rfunction{plotEncDimSearch}.

<<figure_findBestQ, echo=TRUE>>=
plotEncDimSearch(fds, type="jaccard")
@

\subsection{P-value calculation}
\label{sec:P-value-calculation}

Expand Down

0 comments on commit 5255a94

Please sign in to comment.