Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Graph explore #48

Merged
merged 15 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add example_graph() methods
  • Loading branch information
oneilsh committed Oct 30, 2024
commit 67bc763aaf9b1c39049d26709181dfc2f9b7b165
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Type: Package
Title: Monarch Knowledge Graph Queries
Description: R package for easy access, manipulation, and analysis of
Monarch KG data Resources.
Version: 1.3.0
Version: 1.4.0
URL: https://github.com/monarch-initiative/monarchr
BugReports: https://github.com/monarch-initiative/monarchr/issues
Authors@R:
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
S3method(cypher_query,neo4j_engine)
S3method(cypher_query_df,neo4j_engine)
S3method(edges,tbl_kgx)
S3method(example_graph,file_engine)
S3method(example_graph,neo4j_engine)
S3method(expand,tbl_kgx)
S3method(explode,tbl_kgx)
S3method(fetch_nodes,file_engine)
Expand All @@ -19,6 +21,7 @@ export("%~%")
export(cypher_query)
export(cypher_query_df)
export(edges)
export(example_graph)
export(expand)
export(explode)
export(fetch_nodes)
Expand Down
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# monarchr 1.4.0

Dev branch graph_explore notes:
## New features

* `example_graph()` function for engines

# monarchr 1.3.0

Expand Down
31 changes: 31 additions & 0 deletions R/example_graph.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' Return an example set of nodes from a KG engine.
#'
#' Given a KG engine, returns a graph representing the diversity
#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
#' contain at least one node of every category, and at least one edge of every
#' predicate. No other guarantees are made: the example graph is not minimal
#' to satisfy these criteria, it is not random or even pseudo-random, and it
#' may not be connected.
#'
#' @param engine A KG engine object
#' @param ... Other parameters (not used)
#' @return A tbl_kgx graph
#' @export
#' @examples
#' # Using example KGX file packaged with monarchr
#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
#'
#' # prints a readable summary and returns a list of dataframes
#' g <- file_engine(filename) |> example_graph()
#' print(g)
#'
#' @examplesIf monarch_engine_check()
#' # prints a readable summary and returns a list of dataframes
#' g <- monarch_engine() |> example_graph()
#' print(g)
#' @import tidygraph
#' @import dplyr
example_graph <- function(engine, ...) {
UseMethod("example_graph")
}

99 changes: 99 additions & 0 deletions R/example_graph.file_engine.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#' Return an example set of nodes from a KG engine.
#'
#' Given a KGX file-based KG engine, returns a graph representing the diversity
#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
#' contain at least one node of every category, and at least one edge of every
#' predicate. No other guarantees are made: the example graph is not minimal
#' to satisfy these criteria, it is not random or even pseudo-random, and it
#' may not be connected.
#'
#' @param engine A `file_engine` object
#' @param ... Other parameters (not used)
#' @return A tbl_kgx graph
#' @export
#' @examples
#' # Using example KGX file packaged with monarchr
#' filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
#'
#' # Retrieve and print an example graph:
#' g <- file_engine(filename) |> example_graph()
#' print(g)
#' @import tidygraph
#' @import dplyr
example_graph.file_engine <- function(engine, ...) {
# first, let's discover the different edge types (predicates) available
edges_df <- engine$graph |>
activate(edges) |>
as.data.frame()

nodes_df <- engine$graph |>
activate(nodes) |>
as.data.frame()


pred_types <- edges_df |>
pull(predicate) |>
unique()

# next we get a bunch of edges of the different predicate types as a graph
sample_edges <- edges_df |>
group_by(predicate) |>
slice(1) |>
ungroup() |>
select(-to, -from)

sample_nodes <- nodes_df |>
filter(id %in% sample_edges$subject | id %in% sample_edges$object)

sample_preds_graph <- tbl_kgx(nodes = sample_nodes, edges = sample_edges, attach_engine = engine)

# this might not represent all categories however.
# So we compute the categories that are represented thus far
used_categories <- sample_preds_graph |>
activate(nodes) |>
as.data.frame() |>
pull(category) |>
unlist() |>
unique()

# get the available categories
all_node_categories <- nodes_df |>
pull(category) |>
unlist() |>
unique()


# compute the node categories that are still needed
needed_categories <- setdiff(all_node_categories, used_categories)

# now sample nodes of those categories, and an arbitrary connection
# trouble is, nodes_df$category is a list column...
sample_cats_node_ids <- needed_categories |> lapply(function(cat) {
has_cat_rows <- which(cat %in_list% nodes_df$category)
return(nodes_df$id[has_cat_rows[1]])
# unique because a single node may be selected to represent multiple needed categories
}) |> unlist() |> unique()

# ok, we have ids that cover the needed categories. Let's grab one row from
# the edges table for each (id could be subject or object)
sample_cats_edges_list <- sample_cats_node_ids |> lapply(function(node_id) {
row <- edges_df |>
filter(node_id == subject | node_id == object) |>
head(n = 1) |>
select(-to, -from)
})
sample_cats_edges <- do.call(rbind, sample_cats_edges_list)

# now we need to select the corresponding nodes via their ids
sample_cats_all_ids <- c(sample_cats_edges$subject, sample_cats_edges$object) |>
unique()

sample_cats_nodes <- nodes_df |>
filter(id %in% sample_cats_all_ids)

# and join it all up
sample_cats_graph <- tbl_kgx(nodes = sample_cats_nodes, edges = sample_cats_edges, attach_engine = engine, )

suppressMessages(all <- kg_join(sample_cats_graph, sample_preds_graph), classes = "message")
return(all)
}
56 changes: 56 additions & 0 deletions R/example_graph.neo4j_engine.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#' Return an example set of nodes from a KG engine.
#'
#' Given a KGX Neo4j KG engine, returns a graph representing the diversity
#' of node categories and edge predicates for browsing. The returned graph is guaranteed to
#' contain at least one node of every category, and at least one edge of every
#' predicate. No other guarantees are made: the example graph is not minimal
#' to satisfy these criteria, it is not random or even pseudo-random, and it
#' may not be connected.
#'
#' @param engine A `neo4j_engine` object
#' @param ... Other parameters (not used)
#' @return A tbl_kgx graph
#' @export
#' @examplesIf monarch_engine_check()
#' # Retrieve and print an example graph:
#' g <- monarch_engine() |> example_graph()
#' print(g)
#' @import tidygraph
#' @import dplyr
example_graph.neo4j_engine <- function(engine, ...) {
# first, let's discover the different edge types (predicates) available from the schema info
pred_types_query <- "CALL db.schema.visualization() YIELD relationships
UNWIND relationships AS rel
RETURN DISTINCT type(rel) AS predicate"
pred_types <- cypher_query_df(engine, pred_types_query)

# next we get a bunch of edges of the different predicate types as a graph
sample_preds_query <- paste0("MATCH (a)-[r:`", pred_types$predicate, "`]->(b) RETURN a, b, r LIMIT 1")
sample_preds_graph <- cypher_query(engine, query = sample_preds_query)

# this might not represent all categories however.

# So we compute the categories that are represented thus far
used_categories <- sample_preds_graph |>
activate(nodes) |>
as.data.frame() |>
pull(category) |>
unlist() |>
unique()

# get the available categories from the schema
categories_query <- "CALL db.labels() YIELD label RETURN DISTINCT label"
all_node_categories <- cypher_query_df(engine, categories_query)$label

# compute the node categories that are still needed
needed_categories <- setdiff(all_node_categories, used_categories)

# now sample nodes of those categories, and an arbitrary connection
sample_cats_query <- paste0("MATCH (a:`", needed_categories, "`) -[r]- (b) RETURN a, r, b LIMIT 1")
sample_new_cats <- cypher_query(engine, query = sample_cats_query)

# finally, we join the two and return
suppressMessages(full_sample <- kg_join(sample_preds_graph, sample_new_cats), classes = "message")

return(full_sample)
}
38 changes: 38 additions & 0 deletions man/example_graph.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions man/example_graph.file_engine.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions man/example_graph.neo4j_engine.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions tests/testthat/test-example_graph.file_engine.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
library(testthat)
library(assertthat)

test_that("example_graph for file engine", {
#testthat::skip("temporary skip")
options(width = 150)

filename <- system.file("extdata", "eds_marfan_kg.tar.gz", package = "monarchr")
e <- file_engine(filename)

sample <- example_graph(e)

# check some expected categories
expect_true(any("biolink:Disease" %in_list% nodes(sample)$category))
expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category))
expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category))
expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category))
expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category))
expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category))

# check some expected predicates
expect_true(any("biolink:causes" %in_list% edges(sample)$predicate))
expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate))
expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate))
expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate))
expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate))
})
26 changes: 26 additions & 0 deletions tests/testthat/test-example_graph.neo4j_engine.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
library(testthat)
library(assertthat)

test_that("example_graph for neo4j engine", {
#testthat::skip("temporary skip")
options(width = 150)

e <- monarch_engine()

sample <- example_graph(e)

# check some expected categories
expect_true(any("biolink:Disease" %in_list% nodes(sample)$category))
expect_true(any("biolink:GenomicEntity" %in_list% nodes(sample)$category))
expect_true(any("biolink:GeneOrGeneProduct" %in_list% nodes(sample)$category))
expect_true(any("biolink:SequenceVariant" %in_list% nodes(sample)$category))
expect_true(any("biolink:OntologyClass" %in_list% nodes(sample)$category))
expect_true(any("biolink:PhysicalEssence" %in_list% nodes(sample)$category))

# check some expected predicates
expect_true(any("biolink:causes" %in_list% edges(sample)$predicate))
expect_true(any("biolink:subclass_of" %in_list% edges(sample)$predicate))
expect_true(any("biolink:associated_with_increased_likelihood_of" %in_list% edges(sample)$predicate))
expect_true(any("biolink:treats_or_applied_or_studied_to_treat" %in_list% edges(sample)$predicate))
expect_true(any("biolink:genetically_associated_with" %in_list% edges(sample)$predicate))
})