diff --git a/DESCRIPTION b/DESCRIPTION index c8f4fea..96c0a9b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Monarch Knowledge Graph Queries Description: R package for easy access, manipulation, and analysis of Monarch KG data Resources. -Version: 1.2.2 +Version: 1.4.0 URL: https://github.com/monarch-initiative/monarchr BugReports: https://github.com/monarch-initiative/monarchr/issues Authors@R: diff --git a/NAMESPACE b/NAMESPACE index fd78ef8..d2e357f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,8 @@ S3method(cypher_query,neo4j_engine) S3method(cypher_query_df,neo4j_engine) S3method(edges,tbl_kgx) +S3method(example_graph,file_engine) +S3method(example_graph,neo4j_engine) S3method(expand,tbl_kgx) S3method(explode,tbl_kgx) S3method(fetch_nodes,file_engine) @@ -12,11 +14,14 @@ S3method(knit_print,tbl_kgx) S3method(nodes,tbl_kgx) S3method(plot,tbl_kgx) S3method(summarize_neighborhood,tbl_kgx) +S3method(summary,file_engine) +S3method(summary,neo4j_engine) export("%in_list%") export("%~%") export(cypher_query) export(cypher_query_df) export(edges) +export(example_graph) export(expand) export(explode) export(fetch_nodes) @@ -55,6 +60,7 @@ importFrom(kableExtra,column_spec) importFrom(kableExtra,kable) importFrom(kableExtra,kable_styling) importFrom(neo2R,cypher) +importFrom(neo2R,multicypher) importFrom(neo2R,startGraph) importFrom(purrr,map_chr) importFrom(readr,col_character) @@ -68,6 +74,7 @@ importFrom(stringr,str_wrap) importFrom(tibble,tibble) importFrom(tidygraph,activate) importFrom(tidygraph,as_tibble) +importFrom(tidygraph,graph_join) importFrom(tidygraph,tbl_graph) importFrom(utils,capture.output) importFrom(utils,download.file) diff --git a/NEWS.md b/NEWS.md index e14b54e..03507fd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,21 @@ +# monarchr 1.4.0 +## New features + +* `example_graph()` function for engines +* Engine `summary()` now returns named lists of available node categories and edge predicates for convenient auto-completion + +# monarchr 1.3.0 + +## New features + +* `summary()` function for engines + +## Bugfixs + +* fix backend bug in Neo4j table queries not handling default params properly +* add batch queries for Neo4j backend engine +* added summary() for KG engines to summarize total node and edge count information # monarchr 1.2.2 diff --git a/R/cypher_query.R b/R/cypher_query.R index fbdaff6..f0b74fe 100644 --- a/R/cypher_query.R +++ b/R/cypher_query.R @@ -3,7 +3,7 @@ #' This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a tbl_kgx graph. #' #' @param engine A neo4j KG engine -#' @param query A string representing the Cypher query. +#' @param query A string representing the Cypher query. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a single joined graph. #' @param parameters A list of parameters for the Cypher query. Default is an empty list. #' @param ... Additional arguments passed to the function. #' @return The result of the Cypher query as a tbl_kgx graph. diff --git a/R/cypher_query.neo4j_engine.R b/R/cypher_query.neo4j_engine.R index 5b42fb5..99f2091 100644 --- a/R/cypher_query.neo4j_engine.R +++ b/R/cypher_query.neo4j_engine.R @@ -27,13 +27,15 @@ stitch_vectors <- function(x) { } } -########### Public functions ########### - -#' @export -#' @importFrom neo2R cypher -#' @importFrom tibble tibble -cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { # - res <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "graph") +#' Process neo2R cypher to tbl_kgx +#' +#' Given a result from neo2R::cypher returning KGX-formatted nodes and edges, +#' parse the result to generate a tbl_kgx object, attaching the provided engine. +#' +#' @param res The result from neo2R::cypher with result = "graph" +#' @param engine The engine to attach to the returned graph +#' @return A tbl_kgx +neo2r_to_kgx <- function(res, engine) { relationship_ids_contained <- as.integer(unlist(res$paths)) res <- stitch_vectors(res) @@ -103,13 +105,13 @@ cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { # # sapply! # edges_df[[prop_name]] <- sapply(res$relationships, function(edge) { edges_df[[prop_name]] <- sapply(res$relationships, function(edge) { -# edge$properties[[prop_name]] - prop_value <- edge$properties[[prop_name]] - if(is.null(prop_value)) { - return(NA) - } else { - return(prop_value) - } + # edge$properties[[prop_name]] + prop_value <- edge$properties[[prop_name]] + if(is.null(prop_value)) { + return(NA) + } else { + return(prop_value) + } }) } @@ -121,3 +123,27 @@ cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { # attr(g, "relationship_ids") <- relationship_ids_contained return(g) } + +########### Public functions ########### + +#' @export +#' @importFrom neo2R cypher +#' @importFrom neo2R multicypher +#' @importFrom tibble tibble +#' @importFrom tidygraph graph_join +cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { # + if(length(query) == 1) { + res <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "graph") + return(neo2r_to_kgx(res, engine = engine)) + } else { + res <- neo2R::multicypher(engine$graph_conn, queries = query, parameters = parameters, result = "graph") + graphs <- lapply(res, neo2r_to_kgx, engine = engine) + g <- tbl_kgx(nodes = data.frame()) + for(g2 in graphs) { + suppressMessages(g <- tidygraph::graph_join(g, g2), classes = "message") # suppress joining info + } + return(g) + } + + +} diff --git a/R/cypher_query_df.R b/R/cypher_query_df.R index 033ff5b..07883aa 100644 --- a/R/cypher_query_df.R +++ b/R/cypher_query_df.R @@ -3,10 +3,10 @@ #' This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a data frame. #' #' @param engine A neo4j_engine() or derivative providing access to a Neo4j database. -#' @param query A string representing the Cypher query, which should return a table. -#' @param parameters A list of parameters for the Cypher query. Default is an empty list. +#' @param query A string representing the Cypher query, which should return a table. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a list of data frames. +#' @param parameters A list of parameters for the Cypher query, if required. #' @param ... Additional arguments passed to the function. -#' @return The result of the Cypher query as a data frame. +#' @return The result of the Cypher query as a data frame, or a list of data frames if multiple queries are passed. #' @export #' @examplesIf monarch_engine_check() #' engine <- monarch_engine() @@ -17,6 +17,6 @@ #' result <- cypher_query_df(engine, query, parameters) #' print(result) #' @importFrom neo2R cypher -cypher_query_df <- function(engine, query, parameters = list(), ...) { +cypher_query_df <- function(engine, query, parameters = NULL, ...) { UseMethod("cypher_query_df") } diff --git a/R/cypher_query_df.neo4j_engine.R b/R/cypher_query_df.neo4j_engine.R index 0be86b7..d696a76 100644 --- a/R/cypher_query_df.neo4j_engine.R +++ b/R/cypher_query_df.neo4j_engine.R @@ -1,7 +1,12 @@ #' @export #' @importFrom neo2R cypher -cypher_query_df.neo4j_engine <- function(engine, query, parameters = list(), ...) { +#' @importFrom neo2R multicypher +cypher_query_df.neo4j_engine <- function(engine, query, parameters = NULL, ...) { + if(length(query) == 1) { result <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "row", arraysAsStrings = FALSE) + } else { + result <- neo2R::multicypher(engine$graph_conn, queries = query, parameters = parameters, result = "row", arraysAsStrings = FALSE) + } - return(result) + return(result) } diff --git a/R/example_graph.R b/R/example_graph.R new file mode 100644 index 0000000..25a2833 --- /dev/null +++ b/R/example_graph.R @@ -0,0 +1,31 @@ +#' Return an example set of nodes from a KG engine. +#' +#' Given a KG engine, returns a graph representing the diversity +#' of node categories and edge predicates for browsing. The returned graph is guaranteed to +#' contain at least one node of every category, and at least one edge of every +#' predicate. No other guarantees are made: the example graph is not minimal +#' to satisfy these criteria, it is not random or even pseudo-random, and it +#' may not be connected. +#' +#' @param engine A KG engine object +#' @param ... Other parameters (not used) +#' @return A tbl_kgx graph +#' @export +#' @examples +#' # Using example KGX file packaged with monarchr +#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") +#' +#' # prints a readable summary and returns a list of dataframes +#' g <- file_engine(filename) |> example_graph() +#' print(g) +#' +#' @examplesIf monarch_engine_check() +#' # prints a readable summary and returns a list of dataframes +#' g <- monarch_engine() |> example_graph() +#' print(g) +#' @import tidygraph +#' @import dplyr +example_graph <- function(engine, ...) { + UseMethod("example_graph") +} + diff --git a/R/example_graph.file_engine.R b/R/example_graph.file_engine.R new file mode 100644 index 0000000..40e723b --- /dev/null +++ b/R/example_graph.file_engine.R @@ -0,0 +1,99 @@ +#' Return an example set of nodes from a KG engine. +#' +#' Given a KGX file-based KG engine, returns a graph representing the diversity +#' of node categories and edge predicates for browsing. The returned graph is guaranteed to +#' contain at least one node of every category, and at least one edge of every +#' predicate. No other guarantees are made: the example graph is not minimal +#' to satisfy these criteria, it is not random or even pseudo-random, and it +#' may not be connected. +#' +#' @param engine A `file_engine` object +#' @param ... Other parameters (not used) +#' @return A tbl_kgx graph +#' @export +#' @examples +#' # Using example KGX file packaged with monarchr +#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") +#' +#' # Retrieve and print an example graph: +#' g <- file_engine(filename) |> example_graph() +#' print(g) +#' @import tidygraph +#' @import dplyr +example_graph.file_engine <- function(engine, ...) { + # first, let's discover the different edge types (predicates) available + edges_df <- engine$graph |> + activate(edges) |> + as.data.frame() + + nodes_df <- engine$graph |> + activate(nodes) |> + as.data.frame() + + + pred_types <- edges_df |> + pull(predicate) |> + unique() + + # next we get a bunch of edges of the different predicate types as a graph + sample_edges <- edges_df |> + group_by(predicate) |> + slice(1) |> + ungroup() |> + select(-to, -from) + + sample_nodes <- nodes_df |> + filter(id %in% sample_edges$subject | id %in% sample_edges$object) + + sample_preds_graph <- tbl_kgx(nodes = sample_nodes, edges = sample_edges, attach_engine = engine) + + # this might not represent all categories however. + # So we compute the categories that are represented thus far + used_categories <- sample_preds_graph |> + activate(nodes) |> + as.data.frame() |> + pull(category) |> + unlist() |> + unique() + + # get the available categories + all_node_categories <- nodes_df |> + pull(category) |> + unlist() |> + unique() + + + # compute the node categories that are still needed + needed_categories <- setdiff(all_node_categories, used_categories) + + # now sample nodes of those categories, and an arbitrary connection + # trouble is, nodes_df$category is a list column... + sample_cats_node_ids <- needed_categories |> lapply(function(cat) { + has_cat_rows <- which(cat %in_list% nodes_df$category) + return(nodes_df$id[has_cat_rows[1]]) + # unique because a single node may be selected to represent multiple needed categories + }) |> unlist() |> unique() + + # ok, we have ids that cover the needed categories. Let's grab one row from + # the edges table for each (id could be subject or object) + sample_cats_edges_list <- sample_cats_node_ids |> lapply(function(node_id) { + row <- edges_df |> + filter(node_id == subject | node_id == object) |> + head(n = 1) |> + select(-to, -from) + }) + sample_cats_edges <- do.call(rbind, sample_cats_edges_list) + + # now we need to select the corresponding nodes via their ids + sample_cats_all_ids <- c(sample_cats_edges$subject, sample_cats_edges$object) |> + unique() + + sample_cats_nodes <- nodes_df |> + filter(id %in% sample_cats_all_ids) + + # and join it all up + sample_cats_graph <- tbl_kgx(nodes = sample_cats_nodes, edges = sample_cats_edges, attach_engine = engine, ) + + suppressMessages(all <- kg_join(sample_cats_graph, sample_preds_graph), classes = "message") + return(all) +} diff --git a/R/example_graph.neo4j_engine.R b/R/example_graph.neo4j_engine.R new file mode 100644 index 0000000..5a7b6f5 --- /dev/null +++ b/R/example_graph.neo4j_engine.R @@ -0,0 +1,56 @@ +#' Return an example set of nodes from a KG engine. +#' +#' Given a KGX Neo4j KG engine, returns a graph representing the diversity +#' of node categories and edge predicates for browsing. The returned graph is guaranteed to +#' contain at least one node of every category, and at least one edge of every +#' predicate. No other guarantees are made: the example graph is not minimal +#' to satisfy these criteria, it is not random or even pseudo-random, and it +#' may not be connected. +#' +#' @param engine A `neo4j_engine` object +#' @param ... Other parameters (not used) +#' @return A tbl_kgx graph +#' @export +#' @examplesIf monarch_engine_check() +#' # Retrieve and print an example graph: +#' g <- monarch_engine() |> example_graph() +#' print(g) +#' @import tidygraph +#' @import dplyr +example_graph.neo4j_engine <- function(engine, ...) { + # first, let's discover the different edge types (predicates) available from the schema info + pred_types_query <- "CALL db.schema.visualization() YIELD relationships + UNWIND relationships AS rel + RETURN DISTINCT type(rel) AS predicate" + pred_types <- cypher_query_df(engine, pred_types_query) + + # next we get a bunch of edges of the different predicate types as a graph + sample_preds_query <- paste0("MATCH (a)-[r:`", pred_types$predicate, "`]->(b) RETURN a, b, r LIMIT 1") + sample_preds_graph <- cypher_query(engine, query = sample_preds_query) + + # this might not represent all categories however. + + # So we compute the categories that are represented thus far + used_categories <- sample_preds_graph |> + activate(nodes) |> + as.data.frame() |> + pull(category) |> + unlist() |> + unique() + + # get the available categories from the schema + categories_query <- "CALL db.labels() YIELD label RETURN DISTINCT label" + all_node_categories <- cypher_query_df(engine, categories_query)$label + + # compute the node categories that are still needed + needed_categories <- setdiff(all_node_categories, used_categories) + + # now sample nodes of those categories, and an arbitrary connection + sample_cats_query <- paste0("MATCH (a:`", needed_categories, "`) -[r]- (b) RETURN a, r, b LIMIT 1") + sample_new_cats <- cypher_query(engine, query = sample_cats_query) + + # finally, we join the two and return + suppressMessages(full_sample <- kg_join(sample_preds_graph, sample_new_cats), classes = "message") + + return(full_sample) +} diff --git a/R/summary.file_engine.R b/R/summary.file_engine.R new file mode 100644 index 0000000..4124e9b --- /dev/null +++ b/R/summary.file_engine.R @@ -0,0 +1,92 @@ +#' Summarize contents of a KGX-file-based KG engine +#' +#' Given a KGX file-based KG engine, provides summary information in the form of +#' node counts, category counts across nodes, and relationship type counts. +#' General information about the graph is printed to the console, and a list of +#' dataframes describing node and edge counts is returned invisibly. Also returned +#' are `cats` and `preds` entries, containing lists of available node categories and +#' edge predicates, respectively, for convenient auto-completion in RStudio. +#' +#' @param engine A `file_engine` object +#' @param ... Other parameters (not used) +#' @param quiet Logical, whether to suppress printing of the summary +#' @return A list of dataframes and named lists +#' @export +#' @examples +#' # Using example KGX file packaged with monarchr +#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") + +#' # prints a readable summary and returns a list of dataframes +#' res <- file_engine(filename) |> summary() +#' print(res) +#' @import tidygraph +#' @import dplyr +summary.file_engine <- function(engine, ..., quiet = FALSE) { + if(!quiet) { + cat("\n") + cat("A KGX file-backed knowledge graph engine.\n") + } + + g <- engine$graph + + total_nodes <- g |> + activate(nodes) |> + as.data.frame() |> + nrow() + + total_edges <- g |> + activate(edges) |> + as.data.frame() |> + nrow() + + all_node_cats <- g |> + activate(nodes) |> + as.data.frame() |> + pull(category) |> + unlist() |> + sort() |> + rle() + + node_summary_df <- data.frame(category = all_node_cats$values, + count = all_node_cats$lengths) |> + arrange(desc(count)) + + all_edge_predicates <- g |> + activate(edges) |> + as.data.frame() |> + pull(predicate) |> + sort() |> + rle() + + edge_summary_df <- data.frame(predicate = all_edge_predicates$values, + count = all_edge_predicates$lengths) |> + arrange(desc(count)) + + + if(!quiet) { + cat("Total nodes: ", total_nodes, "\n") + cat("Total edges: ", total_edges, "\n") + cat("\n") + cat("Node category counts:\n") + # print the data frame without row names + print(node_summary_df, row.names = FALSE) + cat("\n") + cat("Edge type counts:\n") + # print the data frame without row names + print(edge_summary_df, row.names = FALSE) + } + + + cats <- as.list(node_summary_df$category) + names(cats) <- cats + + preds <- as.list(edge_summary_df$predicate) + names(preds) <- preds + + return(invisible(list(node_summary = node_summary_df, + edge_summary = edge_summary_df, + total_nodes = total_nodes, + total_edges = total_edges, + cats = cats, + preds = preds))) +} diff --git a/R/summary.neo4j_engine.R b/R/summary.neo4j_engine.R new file mode 100644 index 0000000..dd9b9b4 --- /dev/null +++ b/R/summary.neo4j_engine.R @@ -0,0 +1,76 @@ +#' Summarize contents of a Neo4j KG engine +#' +#' Given a Neo4j based KG engine, provides summary information in the form of +#' node counts, category counts across nodes, and relationship type counts. +#' General information about the graph is printed to the console, and a list of +#' dataframes describing node and edge counts is returned invisibly. Also returned +#' are `cats` and `preds` entries, containing lists of available node categories and +#' edge predicates, respectively, for convenient auto-completion in RStudio. +#' +#' @param engine A `neo4j_engine` object +#' @param ... Other parameters (not used) +#' @param quiet Logical, whether to suppress printing of the summary +#' @return A list of dataframes and named lists +#' @export +#' @examplesIf monarch_engine_check() +#' # prints a readable summary and returns a list of dataframes +#' stats <- monarch_engine() |> summary() +#' print(stats) +#' +summary.neo4j_engine <- function(engine, ..., quiet = FALSE) { + if(!quiet) { + cat("\n") + cat("A Neo4j-backed knowledge graph engine.\n") + cat("Gathering statistics, please wait...\n") + } + + # possible optimization: use a schema query to get different available categories, + # count them individually: + # cat_counts_query <- paste0("MATCH (a:`", all_node_categories, "`) WITH count(*) as count, '", all_node_categories ,"' as category RETURN category, count") + # cat_counts <- cypher_query_df(e, cat_counts_query) + # cat_counts_df <- do.call(rbind, cat_counts) |> arrange(desc(count)) + + + node_summary_df <- cypher_query_df(engine, "MATCH (n) UNWIND labels(n) AS category WITH category, COUNT(n) AS count RETURN category, count ORDER BY count DESC") + edge_summary_df <- cypher_query_df(engine, "MATCH ()-[r]->() RETURN type(r) AS predicate, COUNT(*) AS count ORDER BY count DESC") + + counts_query <- " + // Count the total number of nodes + MATCH (n) + RETURN 'nodes_total' AS Type, COUNT(n) AS Count + UNION + // Count the total number of edges + MATCH ()-[r]->() + RETURN 'edges_total' AS Type, COUNT(r) AS Count + " + + total_df <- cypher_query_df(engine, counts_query) + total_nodes <- total_df$Count[1] + total_edges <- total_df$Count[2] + + if(!quiet) { + cat("Total nodes: ", total_nodes, "\n") + cat("Total edges: ", total_edges, "\n") + cat("\n") + cat("Node category counts:\n") + # print the data frame without row names + print(node_summary_df, row.names = FALSE) + cat("\n") + cat("Edge type counts:\n") + # print the data frame without row names + print(edge_summary_df, row.names = FALSE) + } + + cats <- as.list(node_summary_df$category) + names(cats) <- cats + + preds <- as.list(edge_summary_df$predicate) + names(preds) <- preds + + return(invisible(list(node_summary = node_summary_df, + edge_summary = edge_summary_df, + total_nodes = total_nodes, + total_edges = total_edges, + cats = cats, + preds = preds))) +} diff --git a/man/cypher_query.Rd b/man/cypher_query.Rd index bde8781..0e2b3c4 100644 --- a/man/cypher_query.Rd +++ b/man/cypher_query.Rd @@ -9,7 +9,7 @@ cypher_query(engine, query, parameters = NULL, ...) \arguments{ \item{engine}{A neo4j KG engine} -\item{query}{A string representing the Cypher query.} +\item{query}{A string representing the Cypher query. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a single joined graph.} \item{parameters}{A list of parameters for the Cypher query. Default is an empty list.} diff --git a/man/cypher_query_df.Rd b/man/cypher_query_df.Rd index 36762a5..b8f90e1 100644 --- a/man/cypher_query_df.Rd +++ b/man/cypher_query_df.Rd @@ -4,19 +4,19 @@ \alias{cypher_query_df} \title{Execute a Cypher Query} \usage{ -cypher_query_df(engine, query, parameters = list(), ...) +cypher_query_df(engine, query, parameters = NULL, ...) } \arguments{ \item{engine}{A neo4j_engine() or derivative providing access to a Neo4j database.} -\item{query}{A string representing the Cypher query, which should return a table.} +\item{query}{A string representing the Cypher query, which should return a table. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a list of data frames.} -\item{parameters}{A list of parameters for the Cypher query. Default is an empty list.} +\item{parameters}{A list of parameters for the Cypher query, if required.} \item{...}{Additional arguments passed to the function.} } \value{ -The result of the Cypher query as a data frame. +The result of the Cypher query as a data frame, or a list of data frames if multiple queries are passed. } \description{ This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a data frame. diff --git a/man/example_graph.Rd b/man/example_graph.Rd new file mode 100644 index 0000000..5c38678 --- /dev/null +++ b/man/example_graph.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/example_graph.R +\name{example_graph} +\alias{example_graph} +\title{Return an example set of nodes from a KG engine.} +\usage{ +example_graph(engine, ...) +} +\arguments{ +\item{engine}{A KG engine object} + +\item{...}{Other parameters (not used)} +} +\value{ +A tbl_kgx graph +} +\description{ +Given a KG engine, returns a graph representing the diversity +of node categories and edge predicates for browsing. The returned graph is guaranteed to +contain at least one node of every category, and at least one edge of every +predicate. No other guarantees are made: the example graph is not minimal +to satisfy these criteria, it is not random or even pseudo-random, and it +may not be connected. +} +\examples{ +# Using example KGX file packaged with monarchr +filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") + +# prints a readable summary and returns a list of dataframes +g <- file_engine(filename) |> example_graph() +print(g) + +\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# prints a readable summary and returns a list of dataframes +g <- monarch_engine() |> example_graph() +print(g) +\dontshow{\}) # examplesIf} +} diff --git a/man/example_graph.file_engine.Rd b/man/example_graph.file_engine.Rd new file mode 100644 index 0000000..c72c792 --- /dev/null +++ b/man/example_graph.file_engine.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/example_graph.file_engine.R +\name{example_graph.file_engine} +\alias{example_graph.file_engine} +\title{Return an example set of nodes from a KG engine.} +\usage{ +\method{example_graph}{file_engine}(engine, ...) +} +\arguments{ +\item{engine}{A \code{file_engine} object} + +\item{...}{Other parameters (not used)} +} +\value{ +A tbl_kgx graph +} +\description{ +Given a KGX file-based KG engine, returns a graph representing the diversity +of node categories and edge predicates for browsing. The returned graph is guaranteed to +contain at least one node of every category, and at least one edge of every +predicate. No other guarantees are made: the example graph is not minimal +to satisfy these criteria, it is not random or even pseudo-random, and it +may not be connected. +} +\examples{ +# Using example KGX file packaged with monarchr +filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") + +# Retrieve and print an example graph: +g <- file_engine(filename) |> example_graph() +print(g) +} diff --git a/man/example_graph.neo4j_engine.Rd b/man/example_graph.neo4j_engine.Rd new file mode 100644 index 0000000..ee758cb --- /dev/null +++ b/man/example_graph.neo4j_engine.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/example_graph.neo4j_engine.R +\name{example_graph.neo4j_engine} +\alias{example_graph.neo4j_engine} +\title{Return an example set of nodes from a KG engine.} +\usage{ +\method{example_graph}{neo4j_engine}(engine, ...) +} +\arguments{ +\item{engine}{A \code{neo4j_engine} object} + +\item{...}{Other parameters (not used)} +} +\value{ +A tbl_kgx graph +} +\description{ +Given a KGX Neo4j KG engine, returns a graph representing the diversity +of node categories and edge predicates for browsing. The returned graph is guaranteed to +contain at least one node of every category, and at least one edge of every +predicate. No other guarantees are made: the example graph is not minimal +to satisfy these criteria, it is not random or even pseudo-random, and it +may not be connected. +} +\examples{ +\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Retrieve and print an example graph: +g <- monarch_engine() |> example_graph() +print(g) +\dontshow{\}) # examplesIf} +} diff --git a/man/neo2r_to_kgx.Rd b/man/neo2r_to_kgx.Rd new file mode 100644 index 0000000..b46dc45 --- /dev/null +++ b/man/neo2r_to_kgx.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cypher_query.neo4j_engine.R +\name{neo2r_to_kgx} +\alias{neo2r_to_kgx} +\title{Process neo2R cypher to tbl_kgx} +\usage{ +neo2r_to_kgx(res, engine) +} +\arguments{ +\item{res}{The result from neo2R::cypher with result = "graph"} + +\item{engine}{The engine to attach to the returned graph} +} +\value{ +A tbl_kgx +} +\description{ +Given a result from neo2R::cypher returning KGX-formatted nodes and edges, +parse the result to generate a tbl_kgx object, attaching the provided engine. +} diff --git a/man/summary.file_engine.Rd b/man/summary.file_engine.Rd new file mode 100644 index 0000000..3742787 --- /dev/null +++ b/man/summary.file_engine.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summary.file_engine.R +\name{summary.file_engine} +\alias{summary.file_engine} +\title{Summarize contents of a KGX-file-based KG engine} +\usage{ +\method{summary}{file_engine}(engine, ..., quiet = FALSE) +} +\arguments{ +\item{engine}{A \code{file_engine} object} + +\item{...}{Other parameters (not used)} + +\item{quiet}{Logical, whether to suppress printing of the summary} +} +\value{ +A list of dataframes +} +\description{ +Given a KGX file-based KG engine, provides summary information in the form of +node counts, category counts across nodes, and relationship type counts. +General information about the graph is printed to the console, and a list of +dataframes describing node and edge counts is returned invisibly. +} +\examples{ +# Using example KGX file packaged with monarchr +filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") +# prints a readable summary and returns a list of dataframes +res <- file_engine(filename) |> summary() +print(res) +} diff --git a/man/summary.neo4j_engine.Rd b/man/summary.neo4j_engine.Rd new file mode 100644 index 0000000..924161c --- /dev/null +++ b/man/summary.neo4j_engine.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summary.neo4j_engine.R +\name{summary.neo4j_engine} +\alias{summary.neo4j_engine} +\title{Summarize contents of a Neo4j KG engine} +\usage{ +\method{summary}{neo4j_engine}(engine, ..., quiet = FALSE) +} +\arguments{ +\item{engine}{A \code{neo4j_engine} object} + +\item{...}{Other parameters (not used)} + +\item{quiet}{Logical, whether to suppress printing of the summary} +} +\value{ +A list of dataframes +} +\description{ +Given a Neo4j based KG engine, provides summary information in the form of +node counts, category counts across nodes, and relationship type counts. +General information about the graph is printed to the console, and a list of +dataframes describing node and edge counts is returned invisibly. +} +\examples{ +\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# prints a readable summary and returns a list of dataframes +stats <- monarch_engine() |> summary() +print(stats) +\dontshow{\}) # examplesIf} +} diff --git a/meta/dev_tests.R b/meta/dev_tests.R index 9d55e5b..b0d2adf 100644 --- a/meta/dev_tests.R +++ b/meta/dev_tests.R @@ -1,96 +1,161 @@ +devtools::load_all() +f <- file_engine(system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")) +gs <- f |> + fetch_nodes(name %~% "Marfan") |> + expand() +g <- gs |> + expand() -The `expand` function will fetch edges edges between nodes in the query as well; for example, if we want to see if any of the top 5 hits for Fanconi anemia are directly connected, we can begin by getting all of the edges associated with them, storing the result in a separate graph, and taking the intersection of their node sets to filter back down to keep just the original nodes but all the edges between them. +# The `expand` function will fetch edges edges between nodes in the query as well; for example, if we want to see if any of the top 5 hits for Fanconi anemia are directly connected, we can begin by getting all of the edges associated with them, storing the result in a separate graph, and taking the intersection of their node sets to filter back down to keep just the original nodes but all the edges between them. -```{r} -g <- monarch_search("Ehlers-danlos syndrome", limit = 5) +# g <- monarch_search("Ehlers-danlos syndrome", limit = 5) -expanded <- g |> - expand() +# expanded <- g |> +# expand() -joined <- expanded |> - activate(nodes) |> - inner_join(nodes(g)) +# joined <- expanded |> +# activate(nodes) |> +# inner_join(nodes(g)) -joined -``` +# joined -Let's do something more complicated: let's fetch all the genes related to FA and EDS, and all the phenotypes associated with either of those diseases directly or via those genes. +# Let's do something more complicated: let's fetch all the genes related to FA and EDS, and all the phenotypes associated with either of those diseases directly or via those genes. -```{r} -fa <- monarch_search("Fanconi anemia", limit = 1) %>% - expand(result_) -``` +# fa <- monarch_search("Fanconi anemia", limit = 1) %>% +# expand(result_) -Let's visualize that: -```{r fig.height=5, fig.width=7} -ggraph(joined, layout = "fr") + - geom_edge_link() + - geom_node_point(aes(color = pcategory, label = wrap(name))) + - theme_graph() + - theme(legend.position = 'bottom') +# Let's visualize that: -``` +# ggraph(joined, layout = "fr") + +# geom_edge_link() + +# geom_node_point(aes(color = pcategory, label = wrap(name))) + +# theme_graph() + +# theme(legend.position = 'bottom') -Let's visualize that: - ```{r eval=FALSE} -joined <- joined |> - mutate(tooltip = paste0(name, "\n\n", paste(strwrap(description, 80), collapse = "\n"))) |> - mutate(tooltip = paste0(name, "\n\n", pcategory, "\n", iri )) +# # Let's visualize that: +# joined <- joined |> +# mutate(tooltip = paste0(name, "\n\n", paste(strwrap(description, 80), collapse = "\n"))) |> +# mutate(tooltip = paste0(name, "\n\n", pcategory, "\n", iri )) +# # Load required libraries library(digest) library(RColorBrewer) - -# Function to map factor levels or character vector to colors -color_cats <- function(factors) { +library(dplyr) +library(tidygraph) + +# Generates a palette with num_colors entries, mapping +# inputs pseudorandomly to them. if levels_only = FALSE, +# a vector of RGB values the same length as input is returned. +# if levels_only = TRUE, a named vector is returned mapping input +# levels to RGB values +color_cats <- function(input, num_colors = 16, levels_only = TRUE) { # Ensure the input is treated as a factor - factors <- factor(factors) + factors <- factor(input) - # Choose a color palette (e.g., Set3) with a reasonably large number of distinct colors - palette_name <- "Set3" - palette <- brewer.pal(name = palette_name, n = min(length(unique(factors)), brewer.pal.info[palette_name, "max"])) + palette <- grDevices::hcl.colors(num_colors, palette = "Set3") - # Hash function to convert factor levels to numeric values consistently - hashes <- lapply(levels(factors), function(x) digest(x, algo = "crc32", serialize = FALSE)) + # Hash function to convert factor levels to numeric values consistently + hashes <- lapply(levels(factors), function(x) digest::digest(x, algo = "crc32", serialize = FALSE)) hash_integers <- sapply(hashes, function(x) strtoi(substr(x, 1, 5), base=16)) # Map hashes to indices in the color palette # Use modulo to wrap around if there are more factors than colors color_indices <- (hash_integers %% length(palette)) + 1 - color_map <- setNames(palette[color_indices], levels(factors)) - # Return the colors corresponding to the input factors - return(color_map[as.character(factors)]) + color_map <- setNames(palette[color_indices], levels(factors)) + if(levels_only) { + return(color_map) + } else { + # Return the colors corresponding to the input + return(unname(color_map[as.character(input)])) + } +} +# g |> nodes() |> pull(pcategory) |> factor() |> levels() |> head(n = 4) |> color_cats() +# +# +# # # Example usage +# # factors <- c("Apple", "Banana", "Cherry", "Date", "Apple", "Banana") +# # colors <- map_factors_to_colors(factors) +# +# # # Print the mapping +# # print(colors) +# +# +# +# # jexp <- joined |> +# # expand(categories = "biolink:PhenotypicFeature") +# +# +# library(visNetwork) +# visNetwork(nodes(g) %>% +# mutate(kg_id = id) %>% +# mutate(id = 1:nrow(nodes(g))) %>% +# mutate(color = pal), +# edges(g)) %>% +# visEdges(shadow = TRUE) +# +# +# program <- "document.addEventListener('mousemove', function(e) { +# let x = document.getElementsByClassName('infobox')[3]; +# x.style['background'] = '#222222'; +# x.style['border-radius'] = '5px'; +# x.style['color'] = '#222'; +# x.style['font-family'] = 'sans-serif'; +# x.style['position'] = 'absolute'; +# x.style['top'] = e.pageY + 'px'; +# x.style['left'] = e.pageX + 'px'; +# })" +# +# +# library(rthreejs) +# graphjs(gs, +# vertex.color = as.character(pal), +# vertex.size = 0.5, vertex.label = nodes(g)$name) +# +# +# library(RedeR) +# startRedeR() +# addGraphToRedeR(g=gs) + + +cytoscape <- function(g) { + RCy3::cytoscapePing() + + nodes_df <- nodes(g) + nodes_df$desc_wrapped <- stringr::str_wrap(nodes_df$description, 50) + + edges_df <- edges(g) + + RCy3::createNetworkFromDataFrames(nodes_df, + edges_df, + title = "KG Nodes", + collection = "monarchr Graphs", + source.id.list = 'subject', + target.id.list = 'object') + RCy3::layoutNetwork('kamada-kawai') + + pal <- color_cats(nodes(g)$pcategory, levels_only = TRUE) + RCy3::setNodeColorMapping('pcategory', table.column.values = names(pal), colors = pal, mapping.type = 'd') + + RCy3::setNodeTooltipMapping(table.column = 'desc_wrapped') } -# Example usage -factors <- c("Apple", "Banana", "Cherry", "Date", "Apple", "Banana") -colors <- map_factors_to_colors(factors) -# Print the mapping -print(colors) +cytoscape(g) -jexp <- joined |> - expand(categories = "biolink:PhenotypicFeature") +library(ggraph) +library(ggiraph) -library(visNetwork) -visNetwork(nodes(joined) #%>% - #mutate(kg_id = id)# %>% - #mutate(id = 1:nrow(nodes(jexp))) #%>% - # mutate(color = color_cats(pcategory)) - , - edges(joined)) %>% - visEdges(shadow = TRUE) - -z <- ggraph(joined, layout = 'fr') + # fr +z <- ggraph(g, layout = 'fr') + # fr geom_edge_link(aes(alpha = after_stat(index), color = predicate)) + # geom_node_point(aes(color = pcategory)) + @@ -99,7 +164,7 @@ z <- ggraph(joined, layout = 'fr') + # fr y = y, color = pcategory, data_id = id, - tooltip = tooltip) + tooltip = name) ) + scale_edge_alpha('Edge direction', guide = 'edge_direction') + theme_graph() + @@ -111,80 +176,24 @@ girafe(ggobj = z, height_svg = 3, width_svg = 5, opts_tooltip(css = "font-family: sans-serif; background: #333333; padding: 10px; font-size: small") )) -str(z) -``` - -```{r eval=FALSE} -subtypes <- g %>% - expand(direction = "in", predicates = "biolink:subclass_of", transitive = TRUE) - -with_phenos <- subtypes %>% - expand(predicates = "biolink:has_phenotype") - -# print(with_phenos) - -with_genes <- subtypes %>% - expand(categories = "biolink:Gene") - -# print(with_genes) - -neighbors <- graph_join(with_phenos, with_genes) -neighbors -``` - -```{r eval=FALSE} -library(ggraph) - -neighbors <- neighbors %>% - activate(nodes) - -p <- ggraph(neighbors, layout = 'fr') + - geom_edge_link() + - geom_node_point(aes(color = pcategory)) + - theme(legend.position = 'bottom') - -plot(p) -``` +# str(z) +# +# subtypes <- g %>% +# expand(direction = "in", predicates = "biolink:subclass_of", transitive = TRUE) +# +# with_phenos <- subtypes %>% +# expand(predicates = "biolink:has_phenotype") +# +# # print(with_phenos) +# +# with_genes <- subtypes %>% +# expand(categories = "biolink:Gene") +# +# # print(with_genes) +# +# neighbors <- graph_join(with_phenos, with_genes) +# neighbors -```{r eval=FALSE} -library(ggraph) - -eds_phenos <- monarch_search("Ehlers-danlos syndrome", limit = 1) %>% - expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>% - expand(categories = "biolink:PhenotypicFeature") - -# eds_phenos - -fanconi_phenos <- monarch_search("Fanconi anemia", limit = 1) %>% - expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>% - expand(categories = "biolink:PhenotypicFeature") - -# fanconi_phenos - -inner <- inner_join(activate(eds_phenos, nodes), activate(fanconi_phenos, nodes) %>% as_tibble()) -print(inner) - -p <- ggraph(inner, layout = 'fr') + - geom_edge_link() + - geom_node_point(aes(color = pcategory)) + - theme(legend.position = 'bottom') - -plot(p) - -both <- graph_join(eds_phenos, fanconi_phenos) - - -p <- ggraph(both, layout = 'fr') + - geom_edge_link() + - geom_node_point(aes(color = pcategory)) + - theme(legend.position = 'bottom') - - -plot(p) - -``` - -```{r eval=FALSE} library(visNetwork) library(igraph) library(ggiraph) @@ -192,13 +201,13 @@ library(ggraph) # x <- visNetwork(g %N>% as_tibble(), g %E>% as_tibble(), height = "400px") %>% # visNodes(color = list(background = "")) -g <- monarch_search("Fanconi anemia", limit = 1) %>% - expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>% - expand(categories = "biolink:PhenotypicFeature") - -g <- g %>% - activate(nodes) %>% - mutate(tooltip = paste0(name, "\n", description)) +# g <- monarch_search("Fanconi anemia", limit = 1) %>% +# expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>% +# expand(categories = "biolink:PhenotypicFeature") +# +# g <- g %>% +# activate(nodes) %>% +# mutate(tooltip = paste0(name, "\n", description)) z <- ggraph(g, layout = 'fr') + # fr # geom_edge_link(aes(alpha = after_stat(index), @@ -211,7 +220,7 @@ z <- ggraph(g, layout = 'fr') + # fr y = y, color = pcategory, data_id = id, - tooltip = tooltip) + tooltip = name) ) + scale_edge_alpha('Edge direction', guide = 'edge_direction') + theme_graph() + @@ -223,4 +232,139 @@ girafe(ggobj = z, width_svg = 5, height_svg = 5, opts_zoom(max = 5) )) -``` + + +############################# + +devtools::load_all() +# counts by sets - formatted for use with UpSetR +res <- monarch_engine() |> + cypher_query_df("MATCH (n) WITH labels(n) AS LabelList, COUNT(n) AS Count WITH LabelList, Count, REDUCE(s = HEAD(LabelList), x IN TAIL(LabelList) | s + '&' + x) AS LabelCombination RETURN LabelCombination, Count", parameters = list(nodes = list("NCBIGene:2876645"))) + +res + +# counts by sets - catories returned a list col +res <- monarch_engine() |> + cypher_query_df("MATCH (n) WITH labels(n) AS LabelList, COUNT(n) AS Count RETURN LabelList, Count") + +res + + +input <- res$Count +names(input) <- res$LabelCombination + +library(UpSetR) + +# Create the UpSet plot +upset(fromExpression(input), + order.by = "freq", + nsets = 20, + nintersects = NA) + + +# node category counts +cat_counts <- monarch_engine() |> + cypher_query_df("MATCH (n) UNWIND labels(n) AS Label WITH Label, COUNT(n) AS Count RETURN Label, Count ORDER BY Count DESC") + +cat_counts + +# relationship type counts + +pred_counts <- monarch_engine() |> + cypher_query_df("MATCH ()-[r]->() RETURN type(r) AS RelationshipType, COUNT(*) AS RelationshipCount ORDER BY RelationshipCount DESC") + +pred_counts + + +# category set counts for a single relationship type +cat_set_rel_counts <- monarch_engine() |> + cypher_query_df("MATCH (source)-[rel:`biolink:has_phenotype`]->(target) WITH labels(source) AS SourceLabelList, type(rel) AS Relationship, labels(target) AS DestLabelList, COUNT(*) AS Count RETURN SourceLabelList, Relationship, DestLabelList, Count ORDER BY Count DESC") + +cat_set_rel_counts + + + +# property counts on nodes - random examples +# MATCH (n:`biolink:Disease`) +# WITH n, keys(n) AS properties +# UNWIND properties AS property +# WITH property, COUNT(n[property]) AS count, collect([n[property], n.id]) AS examples +# WITH property, count, examples, rand() AS random +# RETURN property, count, examples[TOINTEGER(random * SIZE(examples))][0] AS example, examples[TOINTEGER(random * SIZE(examples))][1] AS example_id +# ORDER BY count DESC + +# property counts on edges - random examples +# MATCH ()-[r:`biolink:causes`]->() +# WITH r, keys(r) AS properties +# UNWIND properties AS property +# WITH property, COUNT(r[property]) AS count, collect([r[property], startNode(r).id, endNode(r).id]) AS examples +# WITH property, count, examples, rand() AS random +# WITH property, count, examples[TOINTEGER(random * SIZE(examples))] AS example_data +# RETURN property, count, example_data[0] AS example, example_data[1] AS source_id, example_data[2] AS target_id +# ORDER BY count DESC + + +# +query <- " +// Directly refer known primary categories +WITH $categories AS KnownCategories +UNWIND KnownCategories AS CategoryA +UNWIND KnownCategories AS CategoryB +WITH DISTINCT CategoryA, CategoryB + +// Match and count relationships using primary known categories +MATCH (a:CategoryA)-[r:`biolink:interacts_with`]->(b:CategoryB) +RETURN CategoryA, type(r) AS RelationshipType, CategoryB, COUNT(*) AS RelationshipCount +ORDER BY RelationshipCount DESC +" + +interacts_counts <- cypher_query_df(monarch_engine(), query, parameters = list(categories = as.list(cat_counts$Label))) +interacts_counts + + +opts <- list(category = as.list(res$Label)) +names(opts$category) <- res$Label + +# property counts for nodes of type biolink:GeneOrGeneProduct +res <- monarch_engine() |> + cypher_query_df("MATCH (n) + WHERE $label IN n.category + WITH n, keys(n) AS properties + UNWIND properties AS property + WITH property, COUNT(n[property]) AS count, collect([n[property], n.id])[0] AS example_data + RETURN property, count, example_data[0] AS example, example_data[1] AS example_id + ORDER BY count DESC", + parameters = list(label = "biolink:GeneOrGeneProduct")) + + +res + + +# schema graph - not compatible with KGX +res <- monarch_engine() |> + cypher_query("CALL db.schema.visualization() YIELD nodes, relationships RETURN [node IN nodes | node {.*, id: id(node)}] AS nodes, relationships") + + +queries <- paste0("MATCH (n:`",res$Label,"`) RETURN n LIMIT 1") + +monarch <- monarch_engine() + +fetched <- cypher_query(monarch, queries = queries) + + +###### diversity sampling... + + + + +s <- monarch_engine() |> example_graph.neo4j_engine() + +cytoscape(s) + + +f <- file_engine(system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")) + + +s2 <- f |> example_graph.file_engine() +s2 |> cytoscape() + diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 0e1ecb8..3bd8da8 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -8,3 +8,4 @@ articles: navbar: Examples contents: - examples/alzheimers_phenotypes + - examples/exploring_kgs diff --git a/tests/testthat/test-cypher_query.R b/tests/testthat/test-cypher_query.R index 24db23a..5843a8c 100644 --- a/tests/testthat/test-cypher_query.R +++ b/tests/testthat/test-cypher_query.R @@ -13,4 +13,21 @@ test_that("cypher_query returns a graph object", { # g should be a tidygraph with two rows in nodes and one in edges expect_equal(nrow(nodes_df), 2) expect_equal(nrow(edges_df), 1) -}) \ No newline at end of file +}) + +test_that("cypher_query works with multicypher queries", { + # skip for now + #testthat::skip("temporary skip") + + e <- monarch_engine() + queries <- c("MATCH (n {id: 'MONDO:0007947'}) RETURN n", + "MATCH (n {id: 'MONDO:0017309'}) RETURN n", + "MATCH (n {id: 'MONDO:0020066'}) RETURN n") + g <- cypher_query(e, query = queries) + expect_s3_class(g, "tbl_kgx") + nodes_df <- data.frame(tidygraph::activate(g, nodes)) + edges_df <- data.frame(tidygraph::activate(g, edges)) + # g should be a tidygraph with two rows in nodes and one in edges + expect_equal(nrow(nodes_df), 3) + expect_equal(nrow(edges_df), 0) +}) diff --git a/tests/testthat/test-example_graph.file_engine.R b/tests/testthat/test-example_graph.file_engine.R new file mode 100644 index 0000000..8d3ff90 --- /dev/null +++ b/tests/testthat/test-example_graph.file_engine.R @@ -0,0 +1,27 @@ +library(testthat) +library(assertthat) + +test_that("example_graph for file engine", { + #testthat::skip("temporary skip") + options(width = 150) + + filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") + e <- file_engine(filename) + + sample <- example_graph(e) + + # check some expected categories + expect_true(any("biolink:Disease" %in_list% nodes(sample)$category)) + expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category)) + expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category)) + expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category)) + expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category)) + expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category)) + + # check some expected predicates + expect_true(any("biolink:causes" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate)) +}) diff --git a/tests/testthat/test-example_graph.neo4j_engine.R b/tests/testthat/test-example_graph.neo4j_engine.R new file mode 100644 index 0000000..43a2f0c --- /dev/null +++ b/tests/testthat/test-example_graph.neo4j_engine.R @@ -0,0 +1,26 @@ +library(testthat) +library(assertthat) + +test_that("example_graph for neo4j engine", { + #testthat::skip("temporary skip") + options(width = 150) + + e <- monarch_engine() + + sample <- example_graph(e) + + # check some expected categories + expect_true(any("biolink:Disease" %in_list% nodes(sample)$category)) + expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category)) + expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category)) + expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category)) + expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category)) + expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category)) + + # check some expected predicates + expect_true(any("biolink:causes" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate)) + expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate)) +}) diff --git a/tests/testthat/test-summary.file_engine.R b/tests/testthat/test-summary.file_engine.R new file mode 100644 index 0000000..aeec678 --- /dev/null +++ b/tests/testthat/test-summary.file_engine.R @@ -0,0 +1,20 @@ +library(testthat) +library(assertthat) + +test_that("summary() for file_engine", { + filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") + engine <- file_engine(filename) + + res <- summary(engine, quiet = TRUE) + + # make sure the output is a list + expect_type(res, "list") + + # let's try a version where we capture the printed output + printed <- capture.output(summary(engine, quiet = FALSE)) + + # the result should be a character vector + expect_type(printed, "character") + # one of the lines should be "Total nodes: " + expect_true(any(grepl("Total nodes: ", printed))) +}) diff --git a/tests/testthat/test-summary.neo4j_engine.R b/tests/testthat/test-summary.neo4j_engine.R new file mode 100644 index 0000000..a39411e --- /dev/null +++ b/tests/testthat/test-summary.neo4j_engine.R @@ -0,0 +1,17 @@ +library(testthat) +library(assertthat) + +test_that("summary() for neo4j_engine", { + res <- summary(monarch_engine(), quiet = TRUE) + + # make sure the output is a list + expect_type(res, "list") + + # let's try a version where we capture the printed output + printed <- capture.output(summary(monarch_engine(), quiet = FALSE)) + + # the result should be a character vector + expect_type(printed, "character") + # one of the lines should be "Total nodes: " + expect_true(any(grepl("Total nodes: ", printed))) +}) diff --git a/vignettes/examples/exploring_kgs.Rmd b/vignettes/examples/exploring_kgs.Rmd new file mode 100644 index 0000000..085ecf1 --- /dev/null +++ b/vignettes/examples/exploring_kgs.Rmd @@ -0,0 +1,128 @@ +--- +title: "Exploring Knowledge Graphs" +author: "Shawn T O'Neil" +date: "Vignette updated: `r format( Sys.Date(), '%b-%d-%Y')`" +output: + rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{monarchr} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +```{r, include=FALSE, message=FALSE} +options(width = 300) +knitr::opts_chunk$set(eval = TRUE, echo = TRUE, fig.width = 10, message = FALSE, warning = FALSE) +``` + +Knowledge Graphs (KGs) may contain large amounts of information; the Monarch Initiative KG for example +contains not only millions of nodes and edges, but each node may belong to one or more *categories*, across +dozens of available categories. While edges may only have a single *predicate* linking a subject and object +node, there are similarly dozens of available predicates. Each node category and edge predicate may further +come with other node or edge properties, and these may be shared across some but not all node categories or +edge predicates. + +To help navigate this extensive information, `monarchr` provides two functions that may be applied to KG +engines: a `summary()` function that counts these categories and predicates across nodes and edges, and +an `example_graph()` function that returns a (non-random) subgraph gauranteed to represent every node +category and edge predicate. + +As usual, we being by loading the `monarchr` package, along with `tidygraph` and `dplyr` which tend to be useful +(but we will not actually use in this vignette). + +```{r} +library(monarchr) +library(tidygraph) +library(dplyr) +``` + + +# Engine `summary()` + +The summary function, when applied to a KG engine (like `file_engine()`, `neo4j_engine()`, +or the cloud-hosted `monarch_engine()`), prints counts of nodes and edges broken out by +available node category and edge predicate. To keep the information small, we'll produce a summary +of the included mini-KG containing information about Ehlers-Danlos Syndrome (EDS) and Marfan Syndrome: + +```{r} +filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr") +eds_marfan <- file_engine(filename) + +summary(eds_marfan) +``` + +The printout reports the number of nodes for each category, and the number of edges for each predicate. + +This information is also returned (invisibly) as a list; to suppress the printed output we can add `quiet = TRUE`. + +```{r} +s <- summary(eds_marfan, quiet = TRUE) + +paste("Total nodes:", s$total_nodes) +paste("Total edges:", s$total_edges) + +head(s$node_summary) +head(s$edge_summary) +``` + +Finally, the returned list also includes `cats` and `pred` entries, which are +named lists containing all available category and predicate labels for +convenient auto-completion in your favorite IDE. + +

+ +![](exploring_kgs_assets/autocomplete.png) + +

+ +The resulting auto-completion inserts the appropriate backtics in RStudio. + +```{r} +diseases <- eds_marfan |> + fetch_nodes(query_ids = "MONDO:0009159") |> + expand(predicates = s$preds$`biolink:has_phenotype`) + +diseases +``` + + +# Engine `example_graph()` + +Fetching a sample of data from a KG is another convenient way to explore its +contents, but a random sample is unlikely to illustrate the diversity of available +node categories, edge predicates, and information associated with nodes and +edges of different types. + +To serve this need `monarchr` provides an `example_graph()` function, which +fetches a sample of nodes and edges that are guaranteed to represent every +available category and every available predicate. When using this method, +it is important to remember that nodes frequently belong to multiple categories, +and the `pcategory` ("primary category") column represents one of a set of +chosen categories to represent the node. The choice of category shown in `pcategory` +is defined by `monarchr`, not the KG itself, and is configurable. + +```{r} +ex_g <- eds_marfan |> example_graph() +ex_g +``` + +Note that this method makes no other guarantees: the sample is not random, +the resulting graph may not be connected, the result is not the smallest possible +graph that contains all categories and predicates, and nodes and edges may not +contain complete data for all possible properties for their respective types. +Still, browsing the resulting graph in tabular form as above can quickly reveal +the bulk of information available in a KG for further targeted exploration +with `fetch_nodes()` and `expand()`. + + + + + + + + + + + + diff --git a/vignettes/examples/exploring_kgs_assets/autocomplete.png b/vignettes/examples/exploring_kgs_assets/autocomplete.png new file mode 100644 index 0000000..15568f8 Binary files /dev/null and b/vignettes/examples/exploring_kgs_assets/autocomplete.png differ