diff --git a/DESCRIPTION b/DESCRIPTION
index c8f4fea..96c0a9b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
Title: Monarch Knowledge Graph Queries
Description: R package for easy access, manipulation, and analysis of
Monarch KG data Resources.
-Version: 1.2.2
+Version: 1.4.0
URL: https://github.com/monarch-initiative/monarchr
BugReports: https://github.com/monarch-initiative/monarchr/issues
Authors@R:
diff --git a/NAMESPACE b/NAMESPACE
index fd78ef8..d2e357f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,8 @@
S3method(cypher_query,neo4j_engine)
S3method(cypher_query_df,neo4j_engine)
S3method(edges,tbl_kgx)
+S3method(example_graph,file_engine)
+S3method(example_graph,neo4j_engine)
S3method(expand,tbl_kgx)
S3method(explode,tbl_kgx)
S3method(fetch_nodes,file_engine)
@@ -12,11 +14,14 @@ S3method(knit_print,tbl_kgx)
S3method(nodes,tbl_kgx)
S3method(plot,tbl_kgx)
S3method(summarize_neighborhood,tbl_kgx)
+S3method(summary,file_engine)
+S3method(summary,neo4j_engine)
export("%in_list%")
export("%~%")
export(cypher_query)
export(cypher_query_df)
export(edges)
+export(example_graph)
export(expand)
export(explode)
export(fetch_nodes)
@@ -55,6 +60,7 @@ importFrom(kableExtra,column_spec)
importFrom(kableExtra,kable)
importFrom(kableExtra,kable_styling)
importFrom(neo2R,cypher)
+importFrom(neo2R,multicypher)
importFrom(neo2R,startGraph)
importFrom(purrr,map_chr)
importFrom(readr,col_character)
@@ -68,6 +74,7 @@ importFrom(stringr,str_wrap)
importFrom(tibble,tibble)
importFrom(tidygraph,activate)
importFrom(tidygraph,as_tibble)
+importFrom(tidygraph,graph_join)
importFrom(tidygraph,tbl_graph)
importFrom(utils,capture.output)
importFrom(utils,download.file)
diff --git a/NEWS.md b/NEWS.md
index e14b54e..03507fd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,21 @@
+# monarchr 1.4.0
+## New features
+
+* `example_graph()` function for engines
+* Engine `summary()` now returns named lists of available node categories and edge predicates for convenient auto-completion
+
+# monarchr 1.3.0
+
+## New features
+
+* `summary()` function for engines
+
+## Bugfixs
+
+* fix backend bug in Neo4j table queries not handling default params properly
+* add batch queries for Neo4j backend engine
+* added summary() for KG engines to summarize total node and edge count information
# monarchr 1.2.2
diff --git a/R/cypher_query.R b/R/cypher_query.R
index fbdaff6..f0b74fe 100644
--- a/R/cypher_query.R
+++ b/R/cypher_query.R
@@ -3,7 +3,7 @@
#' This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a tbl_kgx graph.
#'
#' @param engine A neo4j KG engine
-#' @param query A string representing the Cypher query.
+#' @param query A string representing the Cypher query. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a single joined graph.
#' @param parameters A list of parameters for the Cypher query. Default is an empty list.
#' @param ... Additional arguments passed to the function.
#' @return The result of the Cypher query as a tbl_kgx graph.
diff --git a/R/cypher_query.neo4j_engine.R b/R/cypher_query.neo4j_engine.R
index 5b42fb5..99f2091 100644
--- a/R/cypher_query.neo4j_engine.R
+++ b/R/cypher_query.neo4j_engine.R
@@ -27,13 +27,15 @@ stitch_vectors <- function(x) {
}
}
-########### Public functions ###########
-
-#' @export
-#' @importFrom neo2R cypher
-#' @importFrom tibble tibble
-cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { #
- res <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "graph")
+#' Process neo2R cypher to tbl_kgx
+#'
+#' Given a result from neo2R::cypher returning KGX-formatted nodes and edges,
+#' parse the result to generate a tbl_kgx object, attaching the provided engine.
+#'
+#' @param res The result from neo2R::cypher with result = "graph"
+#' @param engine The engine to attach to the returned graph
+#' @return A tbl_kgx
+neo2r_to_kgx <- function(res, engine) {
relationship_ids_contained <- as.integer(unlist(res$paths))
res <- stitch_vectors(res)
@@ -103,13 +105,13 @@ cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { #
# sapply!
# edges_df[[prop_name]] <- sapply(res$relationships, function(edge) {
edges_df[[prop_name]] <- sapply(res$relationships, function(edge) {
-# edge$properties[[prop_name]]
- prop_value <- edge$properties[[prop_name]]
- if(is.null(prop_value)) {
- return(NA)
- } else {
- return(prop_value)
- }
+ # edge$properties[[prop_name]]
+ prop_value <- edge$properties[[prop_name]]
+ if(is.null(prop_value)) {
+ return(NA)
+ } else {
+ return(prop_value)
+ }
})
}
@@ -121,3 +123,27 @@ cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { #
attr(g, "relationship_ids") <- relationship_ids_contained
return(g)
}
+
+########### Public functions ###########
+
+#' @export
+#' @importFrom neo2R cypher
+#' @importFrom neo2R multicypher
+#' @importFrom tibble tibble
+#' @importFrom tidygraph graph_join
+cypher_query.neo4j_engine <- function(engine, query, parameters = NULL, ...) { #
+ if(length(query) == 1) {
+ res <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "graph")
+ return(neo2r_to_kgx(res, engine = engine))
+ } else {
+ res <- neo2R::multicypher(engine$graph_conn, queries = query, parameters = parameters, result = "graph")
+ graphs <- lapply(res, neo2r_to_kgx, engine = engine)
+ g <- tbl_kgx(nodes = data.frame())
+ for(g2 in graphs) {
+ suppressMessages(g <- tidygraph::graph_join(g, g2), classes = "message") # suppress joining info
+ }
+ return(g)
+ }
+
+
+}
diff --git a/R/cypher_query_df.R b/R/cypher_query_df.R
index 033ff5b..07883aa 100644
--- a/R/cypher_query_df.R
+++ b/R/cypher_query_df.R
@@ -3,10 +3,10 @@
#' This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a data frame.
#'
#' @param engine A neo4j_engine() or derivative providing access to a Neo4j database.
-#' @param query A string representing the Cypher query, which should return a table.
-#' @param parameters A list of parameters for the Cypher query. Default is an empty list.
+#' @param query A string representing the Cypher query, which should return a table. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a list of data frames.
+#' @param parameters A list of parameters for the Cypher query, if required.
#' @param ... Additional arguments passed to the function.
-#' @return The result of the Cypher query as a data frame.
+#' @return The result of the Cypher query as a data frame, or a list of data frames if multiple queries are passed.
#' @export
#' @examplesIf monarch_engine_check()
#' engine <- monarch_engine()
@@ -17,6 +17,6 @@
#' result <- cypher_query_df(engine, query, parameters)
#' print(result)
#' @importFrom neo2R cypher
-cypher_query_df <- function(engine, query, parameters = list(), ...) {
+cypher_query_df <- function(engine, query, parameters = NULL, ...) {
UseMethod("cypher_query_df")
}
diff --git a/R/cypher_query_df.neo4j_engine.R b/R/cypher_query_df.neo4j_engine.R
index 0be86b7..d696a76 100644
--- a/R/cypher_query_df.neo4j_engine.R
+++ b/R/cypher_query_df.neo4j_engine.R
@@ -1,7 +1,12 @@
#' @export
#' @importFrom neo2R cypher
-cypher_query_df.neo4j_engine <- function(engine, query, parameters = list(), ...) {
+#' @importFrom neo2R multicypher
+cypher_query_df.neo4j_engine <- function(engine, query, parameters = NULL, ...) {
+ if(length(query) == 1) {
result <- neo2R::cypher(engine$graph_conn, query = query, parameters = parameters, result = "row", arraysAsStrings = FALSE)
+ } else {
+ result <- neo2R::multicypher(engine$graph_conn, queries = query, parameters = parameters, result = "row", arraysAsStrings = FALSE)
+ }
- return(result)
+ return(result)
}
diff --git a/R/example_graph.R b/R/example_graph.R
new file mode 100644
index 0000000..25a2833
--- /dev/null
+++ b/R/example_graph.R
@@ -0,0 +1,31 @@
+#' Return an example set of nodes from a KG engine.
+#'
+#' Given a KG engine, returns a graph representing the diversity
+#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
+#' contain at least one node of every category, and at least one edge of every
+#' predicate. No other guarantees are made: the example graph is not minimal
+#' to satisfy these criteria, it is not random or even pseudo-random, and it
+#' may not be connected.
+#'
+#' @param engine A KG engine object
+#' @param ... Other parameters (not used)
+#' @return A tbl_kgx graph
+#' @export
+#' @examples
+#' # Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+#'
+#' # prints a readable summary and returns a list of dataframes
+#' g <- file_engine(filename) |> example_graph()
+#' print(g)
+#'
+#' @examplesIf monarch_engine_check()
+#' # prints a readable summary and returns a list of dataframes
+#' g <- monarch_engine() |> example_graph()
+#' print(g)
+#' @import tidygraph
+#' @import dplyr
+example_graph <- function(engine, ...) {
+ UseMethod("example_graph")
+}
+
diff --git a/R/example_graph.file_engine.R b/R/example_graph.file_engine.R
new file mode 100644
index 0000000..40e723b
--- /dev/null
+++ b/R/example_graph.file_engine.R
@@ -0,0 +1,99 @@
+#' Return an example set of nodes from a KG engine.
+#'
+#' Given a KGX file-based KG engine, returns a graph representing the diversity
+#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
+#' contain at least one node of every category, and at least one edge of every
+#' predicate. No other guarantees are made: the example graph is not minimal
+#' to satisfy these criteria, it is not random or even pseudo-random, and it
+#' may not be connected.
+#'
+#' @param engine A `file_engine` object
+#' @param ... Other parameters (not used)
+#' @return A tbl_kgx graph
+#' @export
+#' @examples
+#' # Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+#'
+#' # Retrieve and print an example graph:
+#' g <- file_engine(filename) |> example_graph()
+#' print(g)
+#' @import tidygraph
+#' @import dplyr
+example_graph.file_engine <- function(engine, ...) {
+ # first, let's discover the different edge types (predicates) available
+ edges_df <- engine$graph |>
+ activate(edges) |>
+ as.data.frame()
+
+ nodes_df <- engine$graph |>
+ activate(nodes) |>
+ as.data.frame()
+
+
+ pred_types <- edges_df |>
+ pull(predicate) |>
+ unique()
+
+ # next we get a bunch of edges of the different predicate types as a graph
+ sample_edges <- edges_df |>
+ group_by(predicate) |>
+ slice(1) |>
+ ungroup() |>
+ select(-to, -from)
+
+ sample_nodes <- nodes_df |>
+ filter(id %in% sample_edges$subject | id %in% sample_edges$object)
+
+ sample_preds_graph <- tbl_kgx(nodes = sample_nodes, edges = sample_edges, attach_engine = engine)
+
+ # this might not represent all categories however.
+ # So we compute the categories that are represented thus far
+ used_categories <- sample_preds_graph |>
+ activate(nodes) |>
+ as.data.frame() |>
+ pull(category) |>
+ unlist() |>
+ unique()
+
+ # get the available categories
+ all_node_categories <- nodes_df |>
+ pull(category) |>
+ unlist() |>
+ unique()
+
+
+ # compute the node categories that are still needed
+ needed_categories <- setdiff(all_node_categories, used_categories)
+
+ # now sample nodes of those categories, and an arbitrary connection
+ # trouble is, nodes_df$category is a list column...
+ sample_cats_node_ids <- needed_categories |> lapply(function(cat) {
+ has_cat_rows <- which(cat %in_list% nodes_df$category)
+ return(nodes_df$id[has_cat_rows[1]])
+ # unique because a single node may be selected to represent multiple needed categories
+ }) |> unlist() |> unique()
+
+ # ok, we have ids that cover the needed categories. Let's grab one row from
+ # the edges table for each (id could be subject or object)
+ sample_cats_edges_list <- sample_cats_node_ids |> lapply(function(node_id) {
+ row <- edges_df |>
+ filter(node_id == subject | node_id == object) |>
+ head(n = 1) |>
+ select(-to, -from)
+ })
+ sample_cats_edges <- do.call(rbind, sample_cats_edges_list)
+
+ # now we need to select the corresponding nodes via their ids
+ sample_cats_all_ids <- c(sample_cats_edges$subject, sample_cats_edges$object) |>
+ unique()
+
+ sample_cats_nodes <- nodes_df |>
+ filter(id %in% sample_cats_all_ids)
+
+ # and join it all up
+ sample_cats_graph <- tbl_kgx(nodes = sample_cats_nodes, edges = sample_cats_edges, attach_engine = engine, )
+
+ suppressMessages(all <- kg_join(sample_cats_graph, sample_preds_graph), classes = "message")
+ return(all)
+}
diff --git a/R/example_graph.neo4j_engine.R b/R/example_graph.neo4j_engine.R
new file mode 100644
index 0000000..5a7b6f5
--- /dev/null
+++ b/R/example_graph.neo4j_engine.R
@@ -0,0 +1,56 @@
+#' Return an example set of nodes from a KG engine.
+#'
+#' Given a KGX Neo4j KG engine, returns a graph representing the diversity
+#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
+#' contain at least one node of every category, and at least one edge of every
+#' predicate. No other guarantees are made: the example graph is not minimal
+#' to satisfy these criteria, it is not random or even pseudo-random, and it
+#' may not be connected.
+#'
+#' @param engine A `neo4j_engine` object
+#' @param ... Other parameters (not used)
+#' @return A tbl_kgx graph
+#' @export
+#' @examplesIf monarch_engine_check()
+#' # Retrieve and print an example graph:
+#' g <- monarch_engine() |> example_graph()
+#' print(g)
+#' @import tidygraph
+#' @import dplyr
+example_graph.neo4j_engine <- function(engine, ...) {
+ # first, let's discover the different edge types (predicates) available from the schema info
+ pred_types_query <- "CALL db.schema.visualization() YIELD relationships
+ UNWIND relationships AS rel
+ RETURN DISTINCT type(rel) AS predicate"
+ pred_types <- cypher_query_df(engine, pred_types_query)
+
+ # next we get a bunch of edges of the different predicate types as a graph
+ sample_preds_query <- paste0("MATCH (a)-[r:`", pred_types$predicate, "`]->(b) RETURN a, b, r LIMIT 1")
+ sample_preds_graph <- cypher_query(engine, query = sample_preds_query)
+
+ # this might not represent all categories however.
+
+ # So we compute the categories that are represented thus far
+ used_categories <- sample_preds_graph |>
+ activate(nodes) |>
+ as.data.frame() |>
+ pull(category) |>
+ unlist() |>
+ unique()
+
+ # get the available categories from the schema
+ categories_query <- "CALL db.labels() YIELD label RETURN DISTINCT label"
+ all_node_categories <- cypher_query_df(engine, categories_query)$label
+
+ # compute the node categories that are still needed
+ needed_categories <- setdiff(all_node_categories, used_categories)
+
+ # now sample nodes of those categories, and an arbitrary connection
+ sample_cats_query <- paste0("MATCH (a:`", needed_categories, "`) -[r]- (b) RETURN a, r, b LIMIT 1")
+ sample_new_cats <- cypher_query(engine, query = sample_cats_query)
+
+ # finally, we join the two and return
+ suppressMessages(full_sample <- kg_join(sample_preds_graph, sample_new_cats), classes = "message")
+
+ return(full_sample)
+}
diff --git a/R/summary.file_engine.R b/R/summary.file_engine.R
new file mode 100644
index 0000000..4124e9b
--- /dev/null
+++ b/R/summary.file_engine.R
@@ -0,0 +1,92 @@
+#' Summarize contents of a KGX-file-based KG engine
+#'
+#' Given a KGX file-based KG engine, provides summary information in the form of
+#' node counts, category counts across nodes, and relationship type counts.
+#' General information about the graph is printed to the console, and a list of
+#' dataframes describing node and edge counts is returned invisibly. Also returned
+#' are `cats` and `preds` entries, containing lists of available node categories and
+#' edge predicates, respectively, for convenient auto-completion in RStudio.
+#'
+#' @param engine A `file_engine` object
+#' @param ... Other parameters (not used)
+#' @param quiet Logical, whether to suppress printing of the summary
+#' @return A list of dataframes and named lists
+#' @export
+#' @examples
+#' # Using example KGX file packaged with monarchr
+#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+
+#' # prints a readable summary and returns a list of dataframes
+#' res <- file_engine(filename) |> summary()
+#' print(res)
+#' @import tidygraph
+#' @import dplyr
+summary.file_engine <- function(engine, ..., quiet = FALSE) {
+ if(!quiet) {
+ cat("\n")
+ cat("A KGX file-backed knowledge graph engine.\n")
+ }
+
+ g <- engine$graph
+
+ total_nodes <- g |>
+ activate(nodes) |>
+ as.data.frame() |>
+ nrow()
+
+ total_edges <- g |>
+ activate(edges) |>
+ as.data.frame() |>
+ nrow()
+
+ all_node_cats <- g |>
+ activate(nodes) |>
+ as.data.frame() |>
+ pull(category) |>
+ unlist() |>
+ sort() |>
+ rle()
+
+ node_summary_df <- data.frame(category = all_node_cats$values,
+ count = all_node_cats$lengths) |>
+ arrange(desc(count))
+
+ all_edge_predicates <- g |>
+ activate(edges) |>
+ as.data.frame() |>
+ pull(predicate) |>
+ sort() |>
+ rle()
+
+ edge_summary_df <- data.frame(predicate = all_edge_predicates$values,
+ count = all_edge_predicates$lengths) |>
+ arrange(desc(count))
+
+
+ if(!quiet) {
+ cat("Total nodes: ", total_nodes, "\n")
+ cat("Total edges: ", total_edges, "\n")
+ cat("\n")
+ cat("Node category counts:\n")
+ # print the data frame without row names
+ print(node_summary_df, row.names = FALSE)
+ cat("\n")
+ cat("Edge type counts:\n")
+ # print the data frame without row names
+ print(edge_summary_df, row.names = FALSE)
+ }
+
+
+ cats <- as.list(node_summary_df$category)
+ names(cats) <- cats
+
+ preds <- as.list(edge_summary_df$predicate)
+ names(preds) <- preds
+
+ return(invisible(list(node_summary = node_summary_df,
+ edge_summary = edge_summary_df,
+ total_nodes = total_nodes,
+ total_edges = total_edges,
+ cats = cats,
+ preds = preds)))
+}
diff --git a/R/summary.neo4j_engine.R b/R/summary.neo4j_engine.R
new file mode 100644
index 0000000..dd9b9b4
--- /dev/null
+++ b/R/summary.neo4j_engine.R
@@ -0,0 +1,76 @@
+#' Summarize contents of a Neo4j KG engine
+#'
+#' Given a Neo4j based KG engine, provides summary information in the form of
+#' node counts, category counts across nodes, and relationship type counts.
+#' General information about the graph is printed to the console, and a list of
+#' dataframes describing node and edge counts is returned invisibly. Also returned
+#' are `cats` and `preds` entries, containing lists of available node categories and
+#' edge predicates, respectively, for convenient auto-completion in RStudio.
+#'
+#' @param engine A `neo4j_engine` object
+#' @param ... Other parameters (not used)
+#' @param quiet Logical, whether to suppress printing of the summary
+#' @return A list of dataframes and named lists
+#' @export
+#' @examplesIf monarch_engine_check()
+#' # prints a readable summary and returns a list of dataframes
+#' stats <- monarch_engine() |> summary()
+#' print(stats)
+#'
+summary.neo4j_engine <- function(engine, ..., quiet = FALSE) {
+ if(!quiet) {
+ cat("\n")
+ cat("A Neo4j-backed knowledge graph engine.\n")
+ cat("Gathering statistics, please wait...\n")
+ }
+
+ # possible optimization: use a schema query to get different available categories,
+ # count them individually:
+ # cat_counts_query <- paste0("MATCH (a:`", all_node_categories, "`) WITH count(*) as count, '", all_node_categories ,"' as category RETURN category, count")
+ # cat_counts <- cypher_query_df(e, cat_counts_query)
+ # cat_counts_df <- do.call(rbind, cat_counts) |> arrange(desc(count))
+
+
+ node_summary_df <- cypher_query_df(engine, "MATCH (n) UNWIND labels(n) AS category WITH category, COUNT(n) AS count RETURN category, count ORDER BY count DESC")
+ edge_summary_df <- cypher_query_df(engine, "MATCH ()-[r]->() RETURN type(r) AS predicate, COUNT(*) AS count ORDER BY count DESC")
+
+ counts_query <- "
+ // Count the total number of nodes
+ MATCH (n)
+ RETURN 'nodes_total' AS Type, COUNT(n) AS Count
+ UNION
+ // Count the total number of edges
+ MATCH ()-[r]->()
+ RETURN 'edges_total' AS Type, COUNT(r) AS Count
+ "
+
+ total_df <- cypher_query_df(engine, counts_query)
+ total_nodes <- total_df$Count[1]
+ total_edges <- total_df$Count[2]
+
+ if(!quiet) {
+ cat("Total nodes: ", total_nodes, "\n")
+ cat("Total edges: ", total_edges, "\n")
+ cat("\n")
+ cat("Node category counts:\n")
+ # print the data frame without row names
+ print(node_summary_df, row.names = FALSE)
+ cat("\n")
+ cat("Edge type counts:\n")
+ # print the data frame without row names
+ print(edge_summary_df, row.names = FALSE)
+ }
+
+ cats <- as.list(node_summary_df$category)
+ names(cats) <- cats
+
+ preds <- as.list(edge_summary_df$predicate)
+ names(preds) <- preds
+
+ return(invisible(list(node_summary = node_summary_df,
+ edge_summary = edge_summary_df,
+ total_nodes = total_nodes,
+ total_edges = total_edges,
+ cats = cats,
+ preds = preds)))
+}
diff --git a/man/cypher_query.Rd b/man/cypher_query.Rd
index bde8781..0e2b3c4 100644
--- a/man/cypher_query.Rd
+++ b/man/cypher_query.Rd
@@ -9,7 +9,7 @@ cypher_query(engine, query, parameters = NULL, ...)
\arguments{
\item{engine}{A neo4j KG engine}
-\item{query}{A string representing the Cypher query.}
+\item{query}{A string representing the Cypher query. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a single joined graph.}
\item{parameters}{A list of parameters for the Cypher query. Default is an empty list.}
diff --git a/man/cypher_query_df.Rd b/man/cypher_query_df.Rd
index 36762a5..b8f90e1 100644
--- a/man/cypher_query_df.Rd
+++ b/man/cypher_query_df.Rd
@@ -4,19 +4,19 @@
\alias{cypher_query_df}
\title{Execute a Cypher Query}
\usage{
-cypher_query_df(engine, query, parameters = list(), ...)
+cypher_query_df(engine, query, parameters = NULL, ...)
}
\arguments{
\item{engine}{A neo4j_engine() or derivative providing access to a Neo4j database.}
-\item{query}{A string representing the Cypher query, which should return a table.}
+\item{query}{A string representing the Cypher query, which should return a table. Multiple queries may be passed as a vector; if so, Neo2R::multicypher if used and the result is returned as a list of data frames.}
-\item{parameters}{A list of parameters for the Cypher query. Default is an empty list.}
+\item{parameters}{A list of parameters for the Cypher query, if required.}
\item{...}{Additional arguments passed to the function.}
}
\value{
-The result of the Cypher query as a data frame.
+The result of the Cypher query as a data frame, or a list of data frames if multiple queries are passed.
}
\description{
This function takes a Cypher query and parameters, executes the query using the given engine, and returns the result as a data frame.
diff --git a/man/example_graph.Rd b/man/example_graph.Rd
new file mode 100644
index 0000000..5c38678
--- /dev/null
+++ b/man/example_graph.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/example_graph.R
+\name{example_graph}
+\alias{example_graph}
+\title{Return an example set of nodes from a KG engine.}
+\usage{
+example_graph(engine, ...)
+}
+\arguments{
+\item{engine}{A KG engine object}
+
+\item{...}{Other parameters (not used)}
+}
+\value{
+A tbl_kgx graph
+}
+\description{
+Given a KG engine, returns a graph representing the diversity
+of node categories and edge predicates for browsing. The returned graph is guaranteed to
+contain at least one node of every category, and at least one edge of every
+predicate. No other guarantees are made: the example graph is not minimal
+to satisfy these criteria, it is not random or even pseudo-random, and it
+may not be connected.
+}
+\examples{
+# Using example KGX file packaged with monarchr
+filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+
+# prints a readable summary and returns a list of dataframes
+g <- file_engine(filename) |> example_graph()
+print(g)
+
+\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# prints a readable summary and returns a list of dataframes
+g <- monarch_engine() |> example_graph()
+print(g)
+\dontshow{\}) # examplesIf}
+}
diff --git a/man/example_graph.file_engine.Rd b/man/example_graph.file_engine.Rd
new file mode 100644
index 0000000..c72c792
--- /dev/null
+++ b/man/example_graph.file_engine.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/example_graph.file_engine.R
+\name{example_graph.file_engine}
+\alias{example_graph.file_engine}
+\title{Return an example set of nodes from a KG engine.}
+\usage{
+\method{example_graph}{file_engine}(engine, ...)
+}
+\arguments{
+\item{engine}{A \code{file_engine} object}
+
+\item{...}{Other parameters (not used)}
+}
+\value{
+A tbl_kgx graph
+}
+\description{
+Given a KGX file-based KG engine, returns a graph representing the diversity
+of node categories and edge predicates for browsing. The returned graph is guaranteed to
+contain at least one node of every category, and at least one edge of every
+predicate. No other guarantees are made: the example graph is not minimal
+to satisfy these criteria, it is not random or even pseudo-random, and it
+may not be connected.
+}
+\examples{
+# Using example KGX file packaged with monarchr
+filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+
+# Retrieve and print an example graph:
+g <- file_engine(filename) |> example_graph()
+print(g)
+}
diff --git a/man/example_graph.neo4j_engine.Rd b/man/example_graph.neo4j_engine.Rd
new file mode 100644
index 0000000..ee758cb
--- /dev/null
+++ b/man/example_graph.neo4j_engine.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/example_graph.neo4j_engine.R
+\name{example_graph.neo4j_engine}
+\alias{example_graph.neo4j_engine}
+\title{Return an example set of nodes from a KG engine.}
+\usage{
+\method{example_graph}{neo4j_engine}(engine, ...)
+}
+\arguments{
+\item{engine}{A \code{neo4j_engine} object}
+
+\item{...}{Other parameters (not used)}
+}
+\value{
+A tbl_kgx graph
+}
+\description{
+Given a KGX Neo4j KG engine, returns a graph representing the diversity
+of node categories and edge predicates for browsing. The returned graph is guaranteed to
+contain at least one node of every category, and at least one edge of every
+predicate. No other guarantees are made: the example graph is not minimal
+to satisfy these criteria, it is not random or even pseudo-random, and it
+may not be connected.
+}
+\examples{
+\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Retrieve and print an example graph:
+g <- monarch_engine() |> example_graph()
+print(g)
+\dontshow{\}) # examplesIf}
+}
diff --git a/man/neo2r_to_kgx.Rd b/man/neo2r_to_kgx.Rd
new file mode 100644
index 0000000..b46dc45
--- /dev/null
+++ b/man/neo2r_to_kgx.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cypher_query.neo4j_engine.R
+\name{neo2r_to_kgx}
+\alias{neo2r_to_kgx}
+\title{Process neo2R cypher to tbl_kgx}
+\usage{
+neo2r_to_kgx(res, engine)
+}
+\arguments{
+\item{res}{The result from neo2R::cypher with result = "graph"}
+
+\item{engine}{The engine to attach to the returned graph}
+}
+\value{
+A tbl_kgx
+}
+\description{
+Given a result from neo2R::cypher returning KGX-formatted nodes and edges,
+parse the result to generate a tbl_kgx object, attaching the provided engine.
+}
diff --git a/man/summary.file_engine.Rd b/man/summary.file_engine.Rd
new file mode 100644
index 0000000..3742787
--- /dev/null
+++ b/man/summary.file_engine.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/summary.file_engine.R
+\name{summary.file_engine}
+\alias{summary.file_engine}
+\title{Summarize contents of a KGX-file-based KG engine}
+\usage{
+\method{summary}{file_engine}(engine, ..., quiet = FALSE)
+}
+\arguments{
+\item{engine}{A \code{file_engine} object}
+
+\item{...}{Other parameters (not used)}
+
+\item{quiet}{Logical, whether to suppress printing of the summary}
+}
+\value{
+A list of dataframes
+}
+\description{
+Given a KGX file-based KG engine, provides summary information in the form of
+node counts, category counts across nodes, and relationship type counts.
+General information about the graph is printed to the console, and a list of
+dataframes describing node and edge counts is returned invisibly.
+}
+\examples{
+# Using example KGX file packaged with monarchr
+filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+# prints a readable summary and returns a list of dataframes
+res <- file_engine(filename) |> summary()
+print(res)
+}
diff --git a/man/summary.neo4j_engine.Rd b/man/summary.neo4j_engine.Rd
new file mode 100644
index 0000000..924161c
--- /dev/null
+++ b/man/summary.neo4j_engine.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/summary.neo4j_engine.R
+\name{summary.neo4j_engine}
+\alias{summary.neo4j_engine}
+\title{Summarize contents of a Neo4j KG engine}
+\usage{
+\method{summary}{neo4j_engine}(engine, ..., quiet = FALSE)
+}
+\arguments{
+\item{engine}{A \code{neo4j_engine} object}
+
+\item{...}{Other parameters (not used)}
+
+\item{quiet}{Logical, whether to suppress printing of the summary}
+}
+\value{
+A list of dataframes
+}
+\description{
+Given a Neo4j based KG engine, provides summary information in the form of
+node counts, category counts across nodes, and relationship type counts.
+General information about the graph is printed to the console, and a list of
+dataframes describing node and edge counts is returned invisibly.
+}
+\examples{
+\dontshow{if (monarch_engine_check()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# prints a readable summary and returns a list of dataframes
+stats <- monarch_engine() |> summary()
+print(stats)
+\dontshow{\}) # examplesIf}
+}
diff --git a/meta/dev_tests.R b/meta/dev_tests.R
index 9d55e5b..b0d2adf 100644
--- a/meta/dev_tests.R
+++ b/meta/dev_tests.R
@@ -1,96 +1,161 @@
+devtools::load_all()
+f <- file_engine(system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr"))
+gs <- f |>
+ fetch_nodes(name %~% "Marfan") |>
+ expand()
+g <- gs |>
+ expand()
-The `expand` function will fetch edges edges between nodes in the query as well; for example, if we want to see if any of the top 5 hits for Fanconi anemia are directly connected, we can begin by getting all of the edges associated with them, storing the result in a separate graph, and taking the intersection of their node sets to filter back down to keep just the original nodes but all the edges between them.
+# The `expand` function will fetch edges edges between nodes in the query as well; for example, if we want to see if any of the top 5 hits for Fanconi anemia are directly connected, we can begin by getting all of the edges associated with them, storing the result in a separate graph, and taking the intersection of their node sets to filter back down to keep just the original nodes but all the edges between them.
-```{r}
-g <- monarch_search("Ehlers-danlos syndrome", limit = 5)
+# g <- monarch_search("Ehlers-danlos syndrome", limit = 5)
-expanded <- g |>
- expand()
+# expanded <- g |>
+# expand()
-joined <- expanded |>
- activate(nodes) |>
- inner_join(nodes(g))
+# joined <- expanded |>
+# activate(nodes) |>
+# inner_join(nodes(g))
-joined
-```
+# joined
-Let's do something more complicated: let's fetch all the genes related to FA and EDS, and all the phenotypes associated with either of those diseases directly or via those genes.
+# Let's do something more complicated: let's fetch all the genes related to FA and EDS, and all the phenotypes associated with either of those diseases directly or via those genes.
-```{r}
-fa <- monarch_search("Fanconi anemia", limit = 1) %>%
- expand(result_)
-```
+# fa <- monarch_search("Fanconi anemia", limit = 1) %>%
+# expand(result_)
-Let's visualize that:
-```{r fig.height=5, fig.width=7}
-ggraph(joined, layout = "fr") +
- geom_edge_link() +
- geom_node_point(aes(color = pcategory, label = wrap(name))) +
- theme_graph() +
- theme(legend.position = 'bottom')
+# Let's visualize that:
-```
+# ggraph(joined, layout = "fr") +
+# geom_edge_link() +
+# geom_node_point(aes(color = pcategory, label = wrap(name))) +
+# theme_graph() +
+# theme(legend.position = 'bottom')
-Let's visualize that:
- ```{r eval=FALSE}
-joined <- joined |>
- mutate(tooltip = paste0(name, "\n\n", paste(strwrap(description, 80), collapse = "\n"))) |>
- mutate(tooltip = paste0(name, "\n\n", pcategory, "\n", iri ))
+# # Let's visualize that:
+# joined <- joined |>
+# mutate(tooltip = paste0(name, "\n\n", paste(strwrap(description, 80), collapse = "\n"))) |>
+# mutate(tooltip = paste0(name, "\n\n", pcategory, "\n", iri ))
+#
# Load required libraries
library(digest)
library(RColorBrewer)
-
-# Function to map factor levels or character vector to colors
-color_cats <- function(factors) {
+library(dplyr)
+library(tidygraph)
+
+# Generates a palette with num_colors entries, mapping
+# inputs pseudorandomly to them. if levels_only = FALSE,
+# a vector of RGB values the same length as input is returned.
+# if levels_only = TRUE, a named vector is returned mapping input
+# levels to RGB values
+color_cats <- function(input, num_colors = 16, levels_only = TRUE) {
# Ensure the input is treated as a factor
- factors <- factor(factors)
+ factors <- factor(input)
- # Choose a color palette (e.g., Set3) with a reasonably large number of distinct colors
- palette_name <- "Set3"
- palette <- brewer.pal(name = palette_name, n = min(length(unique(factors)), brewer.pal.info[palette_name, "max"]))
+ palette <- grDevices::hcl.colors(num_colors, palette = "Set3")
- # Hash function to convert factor levels to numeric values consistently
- hashes <- lapply(levels(factors), function(x) digest(x, algo = "crc32", serialize = FALSE))
+ # Hash function to convert factor levels to numeric values consistently
+ hashes <- lapply(levels(factors), function(x) digest::digest(x, algo = "crc32", serialize = FALSE))
hash_integers <- sapply(hashes, function(x) strtoi(substr(x, 1, 5), base=16))
# Map hashes to indices in the color palette
# Use modulo to wrap around if there are more factors than colors
color_indices <- (hash_integers %% length(palette)) + 1
- color_map <- setNames(palette[color_indices], levels(factors))
- # Return the colors corresponding to the input factors
- return(color_map[as.character(factors)])
+ color_map <- setNames(palette[color_indices], levels(factors))
+ if(levels_only) {
+ return(color_map)
+ } else {
+ # Return the colors corresponding to the input
+ return(unname(color_map[as.character(input)]))
+ }
+}
+# g |> nodes() |> pull(pcategory) |> factor() |> levels() |> head(n = 4) |> color_cats()
+#
+#
+# # # Example usage
+# # factors <- c("Apple", "Banana", "Cherry", "Date", "Apple", "Banana")
+# # colors <- map_factors_to_colors(factors)
+#
+# # # Print the mapping
+# # print(colors)
+#
+#
+#
+# # jexp <- joined |>
+# # expand(categories = "biolink:PhenotypicFeature")
+#
+#
+# library(visNetwork)
+# visNetwork(nodes(g) %>%
+# mutate(kg_id = id) %>%
+# mutate(id = 1:nrow(nodes(g))) %>%
+# mutate(color = pal),
+# edges(g)) %>%
+# visEdges(shadow = TRUE)
+#
+#
+# program <- "document.addEventListener('mousemove', function(e) {
+# let x = document.getElementsByClassName('infobox')[3];
+# x.style['background'] = '#222222';
+# x.style['border-radius'] = '5px';
+# x.style['color'] = '#222';
+# x.style['font-family'] = 'sans-serif';
+# x.style['position'] = 'absolute';
+# x.style['top'] = e.pageY + 'px';
+# x.style['left'] = e.pageX + 'px';
+# })"
+#
+#
+# library(rthreejs)
+# graphjs(gs,
+# vertex.color = as.character(pal),
+# vertex.size = 0.5, vertex.label = nodes(g)$name)
+#
+#
+# library(RedeR)
+# startRedeR()
+# addGraphToRedeR(g=gs)
+
+
+cytoscape <- function(g) {
+ RCy3::cytoscapePing()
+
+ nodes_df <- nodes(g)
+ nodes_df$desc_wrapped <- stringr::str_wrap(nodes_df$description, 50)
+
+ edges_df <- edges(g)
+
+ RCy3::createNetworkFromDataFrames(nodes_df,
+ edges_df,
+ title = "KG Nodes",
+ collection = "monarchr Graphs",
+ source.id.list = 'subject',
+ target.id.list = 'object')
+ RCy3::layoutNetwork('kamada-kawai')
+
+ pal <- color_cats(nodes(g)$pcategory, levels_only = TRUE)
+ RCy3::setNodeColorMapping('pcategory', table.column.values = names(pal), colors = pal, mapping.type = 'd')
+
+ RCy3::setNodeTooltipMapping(table.column = 'desc_wrapped')
}
-# Example usage
-factors <- c("Apple", "Banana", "Cherry", "Date", "Apple", "Banana")
-colors <- map_factors_to_colors(factors)
-# Print the mapping
-print(colors)
+cytoscape(g)
-jexp <- joined |>
- expand(categories = "biolink:PhenotypicFeature")
+library(ggraph)
+library(ggiraph)
-library(visNetwork)
-visNetwork(nodes(joined) #%>%
- #mutate(kg_id = id)# %>%
- #mutate(id = 1:nrow(nodes(jexp))) #%>%
- # mutate(color = color_cats(pcategory))
- ,
- edges(joined)) %>%
- visEdges(shadow = TRUE)
-
-z <- ggraph(joined, layout = 'fr') + # fr
+z <- ggraph(g, layout = 'fr') + # fr
geom_edge_link(aes(alpha = after_stat(index),
color = predicate)) +
# geom_node_point(aes(color = pcategory)) +
@@ -99,7 +164,7 @@ z <- ggraph(joined, layout = 'fr') + # fr
y = y,
color = pcategory,
data_id = id,
- tooltip = tooltip)
+ tooltip = name)
) +
scale_edge_alpha('Edge direction', guide = 'edge_direction') +
theme_graph() +
@@ -111,80 +176,24 @@ girafe(ggobj = z, height_svg = 3, width_svg = 5,
opts_tooltip(css = "font-family: sans-serif; background: #333333; padding: 10px; font-size: small")
))
-str(z)
-```
-
-```{r eval=FALSE}
-subtypes <- g %>%
- expand(direction = "in", predicates = "biolink:subclass_of", transitive = TRUE)
-
-with_phenos <- subtypes %>%
- expand(predicates = "biolink:has_phenotype")
-
-# print(with_phenos)
-
-with_genes <- subtypes %>%
- expand(categories = "biolink:Gene")
-
-# print(with_genes)
-
-neighbors <- graph_join(with_phenos, with_genes)
-neighbors
-```
-
-```{r eval=FALSE}
-library(ggraph)
-
-neighbors <- neighbors %>%
- activate(nodes)
-
-p <- ggraph(neighbors, layout = 'fr') +
- geom_edge_link() +
- geom_node_point(aes(color = pcategory)) +
- theme(legend.position = 'bottom')
-
-plot(p)
-```
+# str(z)
+#
+# subtypes <- g %>%
+# expand(direction = "in", predicates = "biolink:subclass_of", transitive = TRUE)
+#
+# with_phenos <- subtypes %>%
+# expand(predicates = "biolink:has_phenotype")
+#
+# # print(with_phenos)
+#
+# with_genes <- subtypes %>%
+# expand(categories = "biolink:Gene")
+#
+# # print(with_genes)
+#
+# neighbors <- graph_join(with_phenos, with_genes)
+# neighbors
-```{r eval=FALSE}
-library(ggraph)
-
-eds_phenos <- monarch_search("Ehlers-danlos syndrome", limit = 1) %>%
- expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>%
- expand(categories = "biolink:PhenotypicFeature")
-
-# eds_phenos
-
-fanconi_phenos <- monarch_search("Fanconi anemia", limit = 1) %>%
- expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>%
- expand(categories = "biolink:PhenotypicFeature")
-
-# fanconi_phenos
-
-inner <- inner_join(activate(eds_phenos, nodes), activate(fanconi_phenos, nodes) %>% as_tibble())
-print(inner)
-
-p <- ggraph(inner, layout = 'fr') +
- geom_edge_link() +
- geom_node_point(aes(color = pcategory)) +
- theme(legend.position = 'bottom')
-
-plot(p)
-
-both <- graph_join(eds_phenos, fanconi_phenos)
-
-
-p <- ggraph(both, layout = 'fr') +
- geom_edge_link() +
- geom_node_point(aes(color = pcategory)) +
- theme(legend.position = 'bottom')
-
-
-plot(p)
-
-```
-
-```{r eval=FALSE}
library(visNetwork)
library(igraph)
library(ggiraph)
@@ -192,13 +201,13 @@ library(ggraph)
# x <- visNetwork(g %N>% as_tibble(), g %E>% as_tibble(), height = "400px") %>%
# visNodes(color = list(background = ""))
-g <- monarch_search("Fanconi anemia", limit = 1) %>%
- expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>%
- expand(categories = "biolink:PhenotypicFeature")
-
-g <- g %>%
- activate(nodes) %>%
- mutate(tooltip = paste0(name, "\n", description))
+# g <- monarch_search("Fanconi anemia", limit = 1) %>%
+# expand(predicates = "biolink:subclass_of", direction = "in", transitive = TRUE) %>%
+# expand(categories = "biolink:PhenotypicFeature")
+#
+# g <- g %>%
+# activate(nodes) %>%
+# mutate(tooltip = paste0(name, "\n", description))
z <- ggraph(g, layout = 'fr') + # fr
# geom_edge_link(aes(alpha = after_stat(index),
@@ -211,7 +220,7 @@ z <- ggraph(g, layout = 'fr') + # fr
y = y,
color = pcategory,
data_id = id,
- tooltip = tooltip)
+ tooltip = name)
) +
scale_edge_alpha('Edge direction', guide = 'edge_direction') +
theme_graph() +
@@ -223,4 +232,139 @@ girafe(ggobj = z, width_svg = 5, height_svg = 5,
opts_zoom(max = 5)
))
-```
+
+
+#############################
+
+devtools::load_all()
+# counts by sets - formatted for use with UpSetR
+res <- monarch_engine() |>
+ cypher_query_df("MATCH (n) WITH labels(n) AS LabelList, COUNT(n) AS Count WITH LabelList, Count, REDUCE(s = HEAD(LabelList), x IN TAIL(LabelList) | s + '&' + x) AS LabelCombination RETURN LabelCombination, Count", parameters = list(nodes = list("NCBIGene:2876645")))
+
+res
+
+# counts by sets - catories returned a list col
+res <- monarch_engine() |>
+ cypher_query_df("MATCH (n) WITH labels(n) AS LabelList, COUNT(n) AS Count RETURN LabelList, Count")
+
+res
+
+
+input <- res$Count
+names(input) <- res$LabelCombination
+
+library(UpSetR)
+
+# Create the UpSet plot
+upset(fromExpression(input),
+ order.by = "freq",
+ nsets = 20,
+ nintersects = NA)
+
+
+# node category counts
+cat_counts <- monarch_engine() |>
+ cypher_query_df("MATCH (n) UNWIND labels(n) AS Label WITH Label, COUNT(n) AS Count RETURN Label, Count ORDER BY Count DESC")
+
+cat_counts
+
+# relationship type counts
+
+pred_counts <- monarch_engine() |>
+ cypher_query_df("MATCH ()-[r]->() RETURN type(r) AS RelationshipType, COUNT(*) AS RelationshipCount ORDER BY RelationshipCount DESC")
+
+pred_counts
+
+
+# category set counts for a single relationship type
+cat_set_rel_counts <- monarch_engine() |>
+ cypher_query_df("MATCH (source)-[rel:`biolink:has_phenotype`]->(target) WITH labels(source) AS SourceLabelList, type(rel) AS Relationship, labels(target) AS DestLabelList, COUNT(*) AS Count RETURN SourceLabelList, Relationship, DestLabelList, Count ORDER BY Count DESC")
+
+cat_set_rel_counts
+
+
+
+# property counts on nodes - random examples
+# MATCH (n:`biolink:Disease`)
+# WITH n, keys(n) AS properties
+# UNWIND properties AS property
+# WITH property, COUNT(n[property]) AS count, collect([n[property], n.id]) AS examples
+# WITH property, count, examples, rand() AS random
+# RETURN property, count, examples[TOINTEGER(random * SIZE(examples))][0] AS example, examples[TOINTEGER(random * SIZE(examples))][1] AS example_id
+# ORDER BY count DESC
+
+# property counts on edges - random examples
+# MATCH ()-[r:`biolink:causes`]->()
+# WITH r, keys(r) AS properties
+# UNWIND properties AS property
+# WITH property, COUNT(r[property]) AS count, collect([r[property], startNode(r).id, endNode(r).id]) AS examples
+# WITH property, count, examples, rand() AS random
+# WITH property, count, examples[TOINTEGER(random * SIZE(examples))] AS example_data
+# RETURN property, count, example_data[0] AS example, example_data[1] AS source_id, example_data[2] AS target_id
+# ORDER BY count DESC
+
+
+#
+query <- "
+// Directly refer known primary categories
+WITH $categories AS KnownCategories
+UNWIND KnownCategories AS CategoryA
+UNWIND KnownCategories AS CategoryB
+WITH DISTINCT CategoryA, CategoryB
+
+// Match and count relationships using primary known categories
+MATCH (a:CategoryA)-[r:`biolink:interacts_with`]->(b:CategoryB)
+RETURN CategoryA, type(r) AS RelationshipType, CategoryB, COUNT(*) AS RelationshipCount
+ORDER BY RelationshipCount DESC
+"
+
+interacts_counts <- cypher_query_df(monarch_engine(), query, parameters = list(categories = as.list(cat_counts$Label)))
+interacts_counts
+
+
+opts <- list(category = as.list(res$Label))
+names(opts$category) <- res$Label
+
+# property counts for nodes of type biolink:GeneOrGeneProduct
+res <- monarch_engine() |>
+ cypher_query_df("MATCH (n)
+ WHERE $label IN n.category
+ WITH n, keys(n) AS properties
+ UNWIND properties AS property
+ WITH property, COUNT(n[property]) AS count, collect([n[property], n.id])[0] AS example_data
+ RETURN property, count, example_data[0] AS example, example_data[1] AS example_id
+ ORDER BY count DESC",
+ parameters = list(label = "biolink:GeneOrGeneProduct"))
+
+
+res
+
+
+# schema graph - not compatible with KGX
+res <- monarch_engine() |>
+ cypher_query("CALL db.schema.visualization() YIELD nodes, relationships RETURN [node IN nodes | node {.*, id: id(node)}] AS nodes, relationships")
+
+
+queries <- paste0("MATCH (n:`",res$Label,"`) RETURN n LIMIT 1")
+
+monarch <- monarch_engine()
+
+fetched <- cypher_query(monarch, queries = queries)
+
+
+###### diversity sampling...
+
+
+
+
+s <- monarch_engine() |> example_graph.neo4j_engine()
+
+cytoscape(s)
+
+
+f <- file_engine(system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr"))
+
+
+s2 <- f |> example_graph.file_engine()
+s2 |> cytoscape()
+
diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml
index 0e1ecb8..3bd8da8 100644
--- a/pkgdown/_pkgdown.yml
+++ b/pkgdown/_pkgdown.yml
@@ -8,3 +8,4 @@ articles:
navbar: Examples
contents:
- examples/alzheimers_phenotypes
+ - examples/exploring_kgs
diff --git a/tests/testthat/test-cypher_query.R b/tests/testthat/test-cypher_query.R
index 24db23a..5843a8c 100644
--- a/tests/testthat/test-cypher_query.R
+++ b/tests/testthat/test-cypher_query.R
@@ -13,4 +13,21 @@ test_that("cypher_query returns a graph object", {
# g should be a tidygraph with two rows in nodes and one in edges
expect_equal(nrow(nodes_df), 2)
expect_equal(nrow(edges_df), 1)
-})
\ No newline at end of file
+})
+
+test_that("cypher_query works with multicypher queries", {
+ # skip for now
+ #testthat::skip("temporary skip")
+
+ e <- monarch_engine()
+ queries <- c("MATCH (n {id: 'MONDO:0007947'}) RETURN n",
+ "MATCH (n {id: 'MONDO:0017309'}) RETURN n",
+ "MATCH (n {id: 'MONDO:0020066'}) RETURN n")
+ g <- cypher_query(e, query = queries)
+ expect_s3_class(g, "tbl_kgx")
+ nodes_df <- data.frame(tidygraph::activate(g, nodes))
+ edges_df <- data.frame(tidygraph::activate(g, edges))
+ # g should be a tidygraph with two rows in nodes and one in edges
+ expect_equal(nrow(nodes_df), 3)
+ expect_equal(nrow(edges_df), 0)
+})
diff --git a/tests/testthat/test-example_graph.file_engine.R b/tests/testthat/test-example_graph.file_engine.R
new file mode 100644
index 0000000..8d3ff90
--- /dev/null
+++ b/tests/testthat/test-example_graph.file_engine.R
@@ -0,0 +1,27 @@
+library(testthat)
+library(assertthat)
+
+test_that("example_graph for file engine", {
+ #testthat::skip("temporary skip")
+ options(width = 150)
+
+ filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+ e <- file_engine(filename)
+
+ sample <- example_graph(e)
+
+ # check some expected categories
+ expect_true(any("biolink:Disease" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category))
+
+ # check some expected predicates
+ expect_true(any("biolink:causes" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate))
+})
diff --git a/tests/testthat/test-example_graph.neo4j_engine.R b/tests/testthat/test-example_graph.neo4j_engine.R
new file mode 100644
index 0000000..43a2f0c
--- /dev/null
+++ b/tests/testthat/test-example_graph.neo4j_engine.R
@@ -0,0 +1,26 @@
+library(testthat)
+library(assertthat)
+
+test_that("example_graph for neo4j engine", {
+ #testthat::skip("temporary skip")
+ options(width = 150)
+
+ e <- monarch_engine()
+
+ sample <- example_graph(e)
+
+ # check some expected categories
+ expect_true(any("biolink:Disease" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category))
+ expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category))
+
+ # check some expected predicates
+ expect_true(any("biolink:causes" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate))
+ expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate))
+})
diff --git a/tests/testthat/test-summary.file_engine.R b/tests/testthat/test-summary.file_engine.R
new file mode 100644
index 0000000..aeec678
--- /dev/null
+++ b/tests/testthat/test-summary.file_engine.R
@@ -0,0 +1,20 @@
+library(testthat)
+library(assertthat)
+
+test_that("summary() for file_engine", {
+ filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+ engine <- file_engine(filename)
+
+ res <- summary(engine, quiet = TRUE)
+
+ # make sure the output is a list
+ expect_type(res, "list")
+
+ # let's try a version where we capture the printed output
+ printed <- capture.output(summary(engine, quiet = FALSE))
+
+ # the result should be a character vector
+ expect_type(printed, "character")
+ # one of the lines should be "Total nodes: "
+ expect_true(any(grepl("Total nodes: ", printed)))
+})
diff --git a/tests/testthat/test-summary.neo4j_engine.R b/tests/testthat/test-summary.neo4j_engine.R
new file mode 100644
index 0000000..a39411e
--- /dev/null
+++ b/tests/testthat/test-summary.neo4j_engine.R
@@ -0,0 +1,17 @@
+library(testthat)
+library(assertthat)
+
+test_that("summary() for neo4j_engine", {
+ res <- summary(monarch_engine(), quiet = TRUE)
+
+ # make sure the output is a list
+ expect_type(res, "list")
+
+ # let's try a version where we capture the printed output
+ printed <- capture.output(summary(monarch_engine(), quiet = FALSE))
+
+ # the result should be a character vector
+ expect_type(printed, "character")
+ # one of the lines should be "Total nodes: "
+ expect_true(any(grepl("Total nodes: ", printed)))
+})
diff --git a/vignettes/examples/exploring_kgs.Rmd b/vignettes/examples/exploring_kgs.Rmd
new file mode 100644
index 0000000..085ecf1
--- /dev/null
+++ b/vignettes/examples/exploring_kgs.Rmd
@@ -0,0 +1,128 @@
+---
+title: "Exploring Knowledge Graphs"
+author: "Shawn T O'Neil"
+date: "Vignette updated: `r format( Sys.Date(), '%b-%d-%Y')`"
+output:
+ rmarkdown::html_document
+vignette: >
+ %\VignetteIndexEntry{monarchr}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+
+```{r, include=FALSE, message=FALSE}
+options(width = 300)
+knitr::opts_chunk$set(eval = TRUE, echo = TRUE, fig.width = 10, message = FALSE, warning = FALSE)
+```
+
+Knowledge Graphs (KGs) may contain large amounts of information; the Monarch Initiative KG for example
+contains not only millions of nodes and edges, but each node may belong to one or more *categories*, across
+dozens of available categories. While edges may only have a single *predicate* linking a subject and object
+node, there are similarly dozens of available predicates. Each node category and edge predicate may further
+come with other node or edge properties, and these may be shared across some but not all node categories or
+edge predicates.
+
+To help navigate this extensive information, `monarchr` provides two functions that may be applied to KG
+engines: a `summary()` function that counts these categories and predicates across nodes and edges, and
+an `example_graph()` function that returns a (non-random) subgraph gauranteed to represent every node
+category and edge predicate.
+
+As usual, we being by loading the `monarchr` package, along with `tidygraph` and `dplyr` which tend to be useful
+(but we will not actually use in this vignette).
+
+```{r}
+library(monarchr)
+library(tidygraph)
+library(dplyr)
+```
+
+
+# Engine `summary()`
+
+The summary function, when applied to a KG engine (like `file_engine()`, `neo4j_engine()`,
+or the cloud-hosted `monarch_engine()`), prints counts of nodes and edges broken out by
+available node category and edge predicate. To keep the information small, we'll produce a summary
+of the included mini-KG containing information about Ehlers-Danlos Syndrome (EDS) and Marfan Syndrome:
+
+```{r}
+filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
+eds_marfan <- file_engine(filename)
+
+summary(eds_marfan)
+```
+
+The printout reports the number of nodes for each category, and the number of edges for each predicate.
+
+This information is also returned (invisibly) as a list; to suppress the printed output we can add `quiet = TRUE`.
+
+```{r}
+s <- summary(eds_marfan, quiet = TRUE)
+
+paste("Total nodes:", s$total_nodes)
+paste("Total edges:", s$total_edges)
+
+head(s$node_summary)
+head(s$edge_summary)
+```
+
+Finally, the returned list also includes `cats` and `pred` entries, which are
+named lists containing all available category and predicate labels for
+convenient auto-completion in your favorite IDE.
+
+
+
+![](exploring_kgs_assets/autocomplete.png)
+
+
+
+The resulting auto-completion inserts the appropriate backtics in RStudio.
+
+```{r}
+diseases <- eds_marfan |>
+ fetch_nodes(query_ids = "MONDO:0009159") |>
+ expand(predicates = s$preds$`biolink:has_phenotype`)
+
+diseases
+```
+
+
+# Engine `example_graph()`
+
+Fetching a sample of data from a KG is another convenient way to explore its
+contents, but a random sample is unlikely to illustrate the diversity of available
+node categories, edge predicates, and information associated with nodes and
+edges of different types.
+
+To serve this need `monarchr` provides an `example_graph()` function, which
+fetches a sample of nodes and edges that are guaranteed to represent every
+available category and every available predicate. When using this method,
+it is important to remember that nodes frequently belong to multiple categories,
+and the `pcategory` ("primary category") column represents one of a set of
+chosen categories to represent the node. The choice of category shown in `pcategory`
+is defined by `monarchr`, not the KG itself, and is configurable.
+
+```{r}
+ex_g <- eds_marfan |> example_graph()
+ex_g
+```
+
+Note that this method makes no other guarantees: the sample is not random,
+the resulting graph may not be connected, the result is not the smallest possible
+graph that contains all categories and predicates, and nodes and edges may not
+contain complete data for all possible properties for their respective types.
+Still, browsing the resulting graph in tabular form as above can quickly reveal
+the bulk of information available in a KG for further targeted exploration
+with `fetch_nodes()` and `expand()`.
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/vignettes/examples/exploring_kgs_assets/autocomplete.png b/vignettes/examples/exploring_kgs_assets/autocomplete.png
new file mode 100644
index 0000000..15568f8
Binary files /dev/null and b/vignettes/examples/exploring_kgs_assets/autocomplete.png differ