From 658df83626c9710a08920c00f7c16d48ce708778 Mon Sep 17 00:00:00 2001 From: Markus Binsteiner Date: Mon, 20 Nov 2023 12:28:48 +0100 Subject: [PATCH] docs: add some usage documentation --- CHANGELOG.md | 4 +- docs/usage.md | 209 +++++++++++++++++- .../example_pipeline_network_analysis.yaml | 25 --- .../network_analysis/data_types.py | 103 ++++++++- .../network_analysis/models/__init__.py | 33 ++- .../network_analysis/modules/components.py | 4 +- 6 files changed, 343 insertions(+), 35 deletions(-) delete mode 100644 examples/pipelines/example_pipeline_network_analysis.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index a3d6b0e..8091e44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,6 @@ Changelog ========= -## Version 0.0.1 (Upcoming) +## Version 0.5.1 (Upcoming) -- first release of *kiara_plugin.network_analysis* +- rename 'network_data.extract_components' to `network_data.calculate_components` diff --git a/docs/usage.md b/docs/usage.md index 186c672..7a12ec0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,4 +1,211 @@ # Usage +## Introduction -TO BE DONE +## The `network_data` type + +If you access the `.data` attribute of a value of the `network_data` type, you will get a Python instance of the class [`NetworkData`](https://github.com/DHARPA-Project/kiara_plugin.network_analysis/blob/develop/src/kiara_plugin/network_analysis/models/__init__.py). 
+ +In Python, this would look something like: + +``` +from kiara.api import KiaraAPI +from kiara_plugin.network_analysis.models import NetworkData + +kiara = KiaraAPI.instance() +network_data_value = kiara.get_value("my_network_data_alias_or_id") + +network_data: NetworkData = network_data_value.data +``` + +or, from within a module `process` method: + +``` +from kiara.api import ValueMap, Value +from kiara_plugin.network_analysis.models import NetworkData + +def process(self, inputs: ValueMap, outputs: ValueMap): + + network_data_obj = inputs.get_value_obj("network_data_input_field_name") + network_data: NetworkData = network_data_obj.data +``` + +This is a wrapper class that stores all the data related to the nodes and edges of the network data in two separate tables (inheriting from [`KiaraTables`](https://github.com/DHARPA-Project/kiara_plugin.tabular/blob/develop/src/kiara_plugin/tabular/models/tables.py), which in turn uses [`KiaraTable`](https://github.com/DHARPA-Project/kiara_plugin.tabular/blob/develop/src/kiara_plugin/tabular/models/table.py) to store the actual per-table data). + +The only two tables that are available in a `NetworkData` instance are called `nodes` and `edges`. You can access them via the `.nodes` and `.edges` attributes of the `NetworkData` instance. As mentioned above, both of these attributes are instances of `KiaraTable`, so you can use all the methods of that class to access the data. The most important ones are: + +- `.arrow_table`: to get the data as an [Apache Arrow](https://arrow.apache.org/) table +- `.to_pandas_dataframe()`: to get the data as a [pandas](https://pandas.pydata.org/) dataframe -- please try to always use the arrow table, as it is much more efficient and avoids loading the whole data into memory in some cases + +As a convention, *kiara* will add columns prefixed with an underscore if the values in it have internal 'meaning'; normal/original attributes are stored in columns without that prefix.
+ +Both node and edge tables contain a unique `id` column (`_node_id`, `_edge_id`) that is generated for each specific network_data instance. You cannot rely on this id being consistent across network_data values (e.g. if you create a filtered `network_data` instance from another one, the same node_id will most likely not refer to the original node row). + +### The 'edges' table + +The `edges` table contains the data about the edges of the network. The most important columns are: + +- `_source`: the source node ids of the edge +- `_target`: the target node ids of the edge + +In addition, this table contains a number of pre-processed, static metadata columns concerning this specific `network_data` instance. You can get information about those using the cli command: + +``` +kiara data-type explain network_data +``` + +The `nodes` table contains the data about node attributes of the network. The `_node_id` column contains the node ids that are referenced by the `_source`/`_target` columns of the `edges` table. + +The table also contains additional pre-processed, static metadata for this specific `network_data` instance, which can be accessed using the same cli command as above. + +## `network_data`-specific metadata + +Along with the pre-processed edge- and node- metadata, a `network_data` value also comes with some more general, pre-processed metadata: + +``` +kiara data explain -p journals_network + +... +... +properties: + "metadata.network_data": { + "number_of_nodes": 276, + "properties_by_graph_type": { + "directed": { + "number_of_edges": 321, + "parallel_edges": 0 + }, + "directed_multi": { + "number_of_edges": 321, + "parallel_edges": 0 + }, + "undirected": { + "number_of_edges": 313, + "parallel_edges": 0 + }, + "undirected_multi": { + "number_of_edges": 321, + "parallel_edges": 8 + } + }, + "number_of_self_loops": 1 + } +... +...
+ +``` + +In a *kiara* module you'd access this information like: + +```python + +def process(self, inputs: ValueMap, outputs: ValueMap): + + network_data_obj: Value = inputs.get_value_obj("network_data_input_field_name") + network_props = network_data_obj.get_property_data('metadata.network_data') +``` + +This gives you information about the number of edges (and parallel edges), depending on which graph type you interpret the data as. For example, the 'undirected' graph type would merge all the edges that have the same source/target and target/source combinations into a single edge, whereas the 'directed' graph type would keep them separate. + +In addition, you can also retrieve the more generic table column metadata for the `nodes` and `edges` tables: + +```python + +table_props = network_data_obj.get_property_data('metadata.tables') +``` + +This can be useful for non-auto-pre-processed node/edge attributes that were copied over from the original data, or just to get +an idea about the general shape of the data. + + +## Creating a `NetworkData` instance in a *kiara* module + +*kiara* tries to make assembling `network_data` as easy as possible for a module developer (this should only ever happen within the context of a module). + +The default way to assemble a `network_data` value is to use the `create_network_data` class method of the [`NetworkData`](https://github.com/DHARPA-Project/kiara_plugin.network_analysis/blob/develop/src/kiara_plugin/network_analysis/models/__init__.py) class. + +This method is the most flexible and powerful, which means it also requires some preparation of the data, and the data to be in a specific format.
To make this easier, there exists a convenience method to create a `network_data` value from an existing `networkx` graph: + +```python +def create_from_networkx_graph( + cls, + graph: "nx.Graph", + label_attr_name: Union[str, None] = None, + ignore_node_attributes: Union[Iterable[str], None] = None, + ) -> "NetworkData": +``` + +In addition, there exists a helper function that lets you create a `network_data` instance from an existing one, in addition to a list of node_ids the new graph should contain (nodes/edges containing ids not in that list will be not included in the new graph) + +```python +def from_filtered_nodes( + cls, network_data: "NetworkData", nodes_list: List[int] +) -> "NetworkData": +``` + + +## Assembling a `network_data` value in a workflow + +The central operation that is used to assemble a `network_data` value is called `assemble.network_data`: + +``` +❯ kiara operation explain assemble.network_data + +╭─ Operation: assemble.network_data ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ │ +│ Documentation Create a 'network_data' instance from one or two tables. │ +│ │ +│ This module needs at least one table as input, providing the edges of the resulting network data set. │ +│ If no further table is created, basic node information will be automatically created by using unique values from the edges │ +│ source and target columns. │ +│ │ +│ If no `source_column_name` (and/or `target_column_name`) is provided, *kiara* will try to auto-detect the most likely of the │ +│ existing columns to use. If that is not possible, an error will be raised. │ +│ │ +│ Inputs │ +│ field name type description Required Default │ +│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── │ +│ edges table A table that contains the edges data. 
yes -- no default -- │ +│ source_column string The name of the source column name in the edges table. no -- no default -- │ +│ target_column string The name of the target column name in the edges table. no -- no default -- │ +│ edges_column_map dict An optional map of original column name to desired. no -- no default -- │ +│ nodes table A table that contains the nodes data. no -- no default -- │ +│ id_column string The name (before any potential column mapping) of the no -- no default -- │ +│ node-table column that contains the node identifier (used in │ +│ the edges table). │ +│ label_column string The name of a column that contains the node label (before any no -- no default -- │ +│ potential column name mapping). If not specified, the value of │ +│ the id value will be used as label. │ +│ nodes_column_map dict An optional map of original column name to desired. no -- no default -- │ +│ │ +│ │ +│ Outputs │ +│ field name type description │ +│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── │ +│ network_data network_data The network/graph data. │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + +``` + +This assumes the user has already imported at least a table containing edge data, which in turn is used in the `edges` input field. Providing a 'nodes' information table is optional. + +The second option of creating a `network_data` value is to use the `create.network_data.from.file` operation, which takes a (raw) `file` as input. This file needs to contain network data in one of the supported formats (e.g. 'gml, 'gexf', 'graphml', ... -- use 'explain' on the operation to get the latest list of supported formats). + + +## Other perations for `network_data` values + +The following operations are available for `network_data` values. 
Use the `operation explain` command to get more information about them. + +### `export.network_data.*` + +These operations take an existing `network_data` instance and export it as a file (or files) to the local filesystem, optionally including *kiara* specific metadata. + +### `network_data.calculate_components` + +Add a `_component_id` column to the nodes table indicating which (separate) component it belongs to; for single component networks this value will be '0' for every node. + +### `network_data_filter.component` + +Filter a `network_data` instance by extracting a single component. diff --git a/examples/pipelines/example_pipeline_network_analysis.yaml b/examples/pipelines/example_pipeline_network_analysis.yaml deleted file mode 100644 index 2088a8b..0000000 --- a/examples/pipelines/example_pipeline_network_analysis.yaml +++ /dev/null @@ -1,25 +0,0 @@ -pipeline_name: example_pipeline_network_analysis -doc: Example pipeline for the network_analysis plugin. -steps: - - step_id: add_hello_string - module_type: example_proj.example - module_config: - separator: " " - constants: - text_1: "Hello" - defaults: - text_2: "World" - - step_id: add_exclamation_mark - module_type: example_proj.example - module_config: - separator: "" - constants: - text_2: "!"
- input_links: - text_1: add_hello_string.text - -input_aliases: - add_hello_string.text_2: name - -output_aliases: - add_exclamation_mark.text: greeting diff --git a/src/kiara_plugin/network_analysis/data_types.py b/src/kiara_plugin/network_analysis/data_types.py index f76f336..5f7f541 100644 --- a/src/kiara_plugin/network_analysis/data_types.py +++ b/src/kiara_plugin/network_analysis/data_types.py @@ -11,10 +11,21 @@ from kiara.models.values.value import Value from kiara.utils.output import ArrowTabularWrap from kiara_plugin.network_analysis.defaults import ( + CONNECTIONS_COLUMN_NAME, + CONNECTIONS_MULTI_COLUMN_NAME, + COUNT_DIRECTED_COLUMN_NAME, + COUNT_IDX_DIRECTED_COLUMN_NAME, + COUNT_IDX_UNDIRECTED_COLUMN_NAME, + COUNT_UNDIRECTED_COLUMN_NAME, + EDGE_ID_COLUMN_NAME, EDGES_TABLE_NAME, + IN_DIRECTED_COLUMN_NAME, + IN_DIRECTED_MULTI_COLUMN_NAME, LABEL_COLUMN_NAME, NODE_ID_COLUMN_NAME, NODES_TABLE_NAME, + OUT_DIRECTED_COLUMN_NAME, + OUT_DIRECTED_MULTI_COLUMN_NAME, SOURCE_COLUMN_NAME, TARGET_COLUMN_NAME, ) @@ -26,16 +37,106 @@ class NetworkDataType(TablesType): """Data that can be assembled into a graph. - This data type extends the 'database' type from the [kiara_plugin.tabular](https://github.com/DHARPA-Project/kiara_plugin.tabular) plugin, restricting the allowed tables to one called 'edges', + This data type extends the 'tables' type from the [kiara_plugin.tabular](https://github.com/DHARPA-Project/kiara_plugin.tabular) plugin, restricting the allowed tables to one called 'edges', and one called 'nodes'. 
""" _data_type_name: ClassVar[str] = "network_data" + _cached_doc: ClassVar[Union[str, None]] = None @classmethod def python_class(cls) -> Type: return NetworkData # type: ignore + @classmethod + def type_doc(cls) -> str: + + if cls._cached_doc: + return cls._cached_doc + + from kiara_plugin.network_analysis.models.metadata import ( + EDGE_COUNT_DUP_DIRECTED_COLUMN_METADATA, + EDGE_COUNT_DUP_UNDIRECTED_COLUMN_METADATA, + EDGE_ID_COLUMN_METADATA, + EDGE_IDX_DUP_DIRECTED_COLUMN_METADATA, + EDGE_IDX_DUP_UNDIRECTED_COLUMN_METADATA, + EDGE_SOURCE_COLUMN_METADATA, + EDGE_TARGET_COLUMN_METADATA, + NODE_COUND_EDGES_MULTI_COLUMN_METADATA, + NODE_COUNT_EDGES_COLUMN_METADATA, + NODE_COUNT_IN_EDGES_COLUMN_METADATA, + NODE_COUNT_IN_EDGES_MULTI_COLUMN_METADATA, + NODE_COUNT_OUT_EDGES_COLUMN_METADATA, + NODE_COUNT_OUT_EDGES_MULTI_COLUMN_METADATA, + NODE_ID_COLUMN_METADATA, + NODE_LABEL_COLUMN_METADATA, + ) + + edge_properties = {} + edge_properties[EDGE_ID_COLUMN_NAME] = EDGE_ID_COLUMN_METADATA.doc.full_doc + edge_properties[SOURCE_COLUMN_NAME] = EDGE_SOURCE_COLUMN_METADATA.doc.full_doc + edge_properties[TARGET_COLUMN_NAME] = EDGE_TARGET_COLUMN_METADATA.doc.full_doc + edge_properties[ + COUNT_DIRECTED_COLUMN_NAME + ] = EDGE_COUNT_DUP_DIRECTED_COLUMN_METADATA.doc.full_doc + edge_properties[ + COUNT_IDX_DIRECTED_COLUMN_NAME + ] = EDGE_IDX_DUP_DIRECTED_COLUMN_METADATA.doc.full_doc + edge_properties[ + COUNT_UNDIRECTED_COLUMN_NAME + ] = EDGE_COUNT_DUP_UNDIRECTED_COLUMN_METADATA.doc.full_doc + edge_properties[ + COUNT_IDX_UNDIRECTED_COLUMN_NAME + ] = EDGE_IDX_DUP_UNDIRECTED_COLUMN_METADATA.doc.full_doc + + properties_node = {} + properties_node[NODE_ID_COLUMN_NAME] = NODE_ID_COLUMN_METADATA.doc.full_doc + properties_node[LABEL_COLUMN_NAME] = NODE_LABEL_COLUMN_METADATA.doc.full_doc + properties_node[ + CONNECTIONS_COLUMN_NAME + ] = NODE_COUNT_EDGES_COLUMN_METADATA.doc.full_doc + properties_node[ + CONNECTIONS_MULTI_COLUMN_NAME + ] = NODE_COUND_EDGES_MULTI_COLUMN_METADATA.doc.full_doc + 
properties_node[ + IN_DIRECTED_COLUMN_NAME + ] = NODE_COUNT_IN_EDGES_COLUMN_METADATA.doc.full_doc + properties_node[ + IN_DIRECTED_MULTI_COLUMN_NAME + ] = NODE_COUNT_IN_EDGES_MULTI_COLUMN_METADATA.doc.full_doc + properties_node[ + OUT_DIRECTED_COLUMN_NAME + ] = NODE_COUNT_OUT_EDGES_COLUMN_METADATA.doc.full_doc + properties_node[ + OUT_DIRECTED_MULTI_COLUMN_NAME + ] = NODE_COUNT_OUT_EDGES_MULTI_COLUMN_METADATA.doc.full_doc + + edge_properties_str = "\n\n".join( + f"***{key}***:\n\n{value}" for key, value in edge_properties.items() + ) + node_properties_str = "\n\n".join( + f"***{key}***:\n\n{value}" for key, value in properties_node.items() + ) + + doc = cls.__doc__ + doc_tables = f""" + +## Edges +The 'edges' table contains the following columns: + +{edge_properties_str} + +## Nodes + +The 'nodes' table contains the following columns: + +{node_properties_str} + +""" + + cls._cached_doc = f"{doc}\n\n{doc_tables}" + return cls._cached_doc + def parse_python_obj(self, data: Any) -> NetworkData: if isinstance(data, KiaraTables): diff --git a/src/kiara_plugin/network_analysis/models/__init__.py b/src/kiara_plugin/network_analysis/models/__init__.py index b63e41d..f7ef854 100644 --- a/src/kiara_plugin/network_analysis/models/__init__.py +++ b/src/kiara_plugin/network_analysis/models/__init__.py @@ -81,9 +81,9 @@ def __call__(self, _source: int, _target: int, **kwargs) -> None: class NetworkData(KiaraTables): - """A helper class to access and query network datasets. + """A wrapper class to access and query network datasets. - This class provides different ways to access the underlying network data, most notably via sql and as networkx Graph object. + This class provides different ways to access the underlying network data, most notably via sql and as rustworkx (also networkx) Graph object. Internally, network data is stored as 2 Arrow tables with the edges stored in a table called 'edges' and the nodes in a table called 'nodes'. 
The edges table must have (at least) the following columns: '_source', '_target'. The nodes table must have (at least) the following columns: '_id' (integer), '_label' (string). @@ -106,7 +106,17 @@ def create_network_data( This method requires the nodes to have an "_id' column (int) as well as a '_label' one (utf8). The edges table needs at least a '_source' (int) and '_target' (int) column. - This method will augment both tables with additional columns that are required for the internal representation (weights, degrees). + This method can augment both tables with additional columns that are required for the internal representation (id, counts, etc). + + If you specify additional metadata, it will be attached to the columns of the tables. This is useful if you want to add metadata that was part of the original data, and that you want to be available when the data is used in a network analysis (for example existing weight data). The format of that additional metadata is a dict of dicts, with the + root key the column name, the second key the property name, and the value the property value. + + Arguments: + nodes_table: the table containing the nodes data + edges_table: the table containing the edges data + augment_tables: whether to augment the tables with pre-processed edge/node metadata (in most cases you want to do this, except if you know the metadata is already present and correct) + nodes_column_metadata: additional metadata to attach to the nodes table columns + edges_column_metadata: additional metadata to attach to the edges table columns """ from kiara_plugin.network_analysis.models.metadata import ( @@ -249,6 +259,14 @@ def create_network_data( def from_filtered_nodes( cls, network_data: "NetworkData", nodes_list: List[int] ) -> "NetworkData": + """Create a new, filtered instance of this class using a source network, and a list of node ids to include. + + Nodes/edges containing a node id not in the list will be removed from the resulting network data. 
+ + Arguments: + network_data: the source network data + nodes_list: the list of node ids to include in the filtered network data + """ import duckdb import polars as pl @@ -304,7 +322,14 @@ def create_from_networkx_graph( label_attr_name: Union[str, None] = None, ignore_node_attributes: Union[Iterable[str], None] = None, ) -> "NetworkData": - """Create a `NetworkData` instance from a networkx Graph object.""" + """Create a `NetworkData` instance from a networkx Graph object. + + Arguments: + graph: the networkx graph instance + label_attr_name: the name of the node attribute that contains the node label (if None, the node id is used as label) + ignore_node_attributes: a list of node attributes that should be ignored and not added to the table + + """ # TODO: should we also index nodes/edges attributes? diff --git a/src/kiara_plugin/network_analysis/modules/components.py b/src/kiara_plugin/network_analysis/modules/components.py index ce39b34..155a5e8 100644 --- a/src/kiara_plugin/network_analysis/modules/components.py +++ b/src/kiara_plugin/network_analysis/modules/components.py @@ -33,14 +33,14 @@ class ExtractLargestComponentModule(KiaraModule): - """Extract the largest connected component from this network data. + """Calculate component information for this network data. This module analyses network data and checks if it contains clusters, and if so, how many. If all nodes are connected, all nodes will have '0' as value in the component_id field. Otherwise, the nodes will be assigned 'component_id'-s according to the component they belong to, with the largest component having '0' as component_id, the second largest '1' and so on. If two components have the same size, who gets the higher component_id is not determinate. """ - _module_type_name = "network_data.extract_components" + _module_type_name = "network_data.calculate_components" def create_inputs_schema( self,