Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/mx-1673 full text search support #226

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6dc01ae
update searchable fields for full text search
mr-kamran-ali Sep 17, 2024
1e38fe9
Merge branch 'main' of https://github.com/robert-koch-institut/mex-ba…
mr-kamran-ali Oct 1, 2024
7757aac
Add search in nested items
mr-kamran-ali Oct 1, 2024
7e2d957
add more integration tests
mr-kamran-ali Oct 1, 2024
efa4617
add search in nested nodes to fetch merged items query
mr-kamran-ali Oct 1, 2024
a5d7708
add more integration tests
mr-kamran-ali Oct 15, 2024
063ccc7
remove hardcoded ids from query
mr-kamran-ali Oct 15, 2024
abcb21c
Merge branch 'main' of https://github.com/robert-koch-institut/mex-ba…
mr-kamran-ali Oct 28, 2024
2373f90
update query builder test
mr-kamran-ali Oct 28, 2024
21e8772
unify variable names in queries
cutoffthetop Nov 21, 2024
afbfe0f
Merge branch 'main' of https://github.com/robert-koch-institut/mex-ba…
cutoffthetop Nov 21, 2024
417ebda
mob code-review
cutoffthetop Nov 21, 2024
32f5595
feature/mx-1673 prepare fulltext search
cutoffthetop Jan 6, 2025
fb140b7
fix test
cutoffthetop Jan 6, 2025
9d6bd94
Changelog
cutoffthetop Jan 6, 2025
596f3bf
Merge branch 'feature/mx-1673-renaming' into feature/mx-1673-add-full…
cutoffthetop Jan 6, 2025
e6afc76
write cypher keywords as uppercase
cutoffthetop Jan 6, 2025
531b451
some more renaming
cutoffthetop Jan 6, 2025
45e7b6b
Merge branch 'feature/mx-1673-renaming' into feature/mx-1673-full-tex…
cutoffthetop Jan 6, 2025
647c695
clean up
cutoffthetop Jan 6, 2025
4691c91
fix sorting of extracted_or_rule_node
cutoffthetop Jan 6, 2025
5739436
fix up look sharp
cutoffthetop Jan 7, 2025
03494a6
add two matched organizations to the test dummy data
cutoffthetop Jan 7, 2025
f8e11e1
fix tests
cutoffthetop Jan 9, 2025
06f2b20
Merge branch 'main' of https://github.com/robert-koch-institut/mex-ba…
cutoffthetop Jan 9, 2025
58f58e1
Merge branch 'feature/mx-1673-matched-test-data' into feature/mx-1673…
cutoffthetop Jan 9, 2025
26ec8a2
simplify diff
cutoffthetop Jan 9, 2025
c780555
optimize queries, update tests
cutoffthetop Jan 10, 2025
ca63a8a
Merge branch 'main' of https://github.com/robert-koch-institut/mex-ba…
cutoffthetop Jan 10, 2025
fb0b718
add more extracted search tests
cutoffthetop Jan 10, 2025
29088a1
add more tests
cutoffthetop Jan 10, 2025
813df5f
CL
cutoffthetop Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- rename short and obscure cypher query variables to more expressive and verbose ones
- rename `stable_target_id` to more appropriate `identifier` argument for merged queries
- add support for full text queries on nested models to find extracted/rule/merged items
- optimize extracted/rule/merged search queries by applying sorting and pagination
before pulling in nested models and the identifiers of referenced merged items,
and by replacing subqueries with Cypher "pattern comprehension" syntax
- prefix `components` in merged queries with `_` (now `_components`), for consistency with `_refs`
- add email fields to `SEARCHABLE_FIELDS` and `SEARCHABLE_CLASSES` (stop-gap MX-1766)

### Deprecated

Expand Down
18 changes: 15 additions & 3 deletions mex/backend/fields.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
from mex.common.fields import STRING_FIELDS_BY_CLASS_NAME
from itertools import chain

from mex.common.fields import EMAIL_FIELDS_BY_CLASS_NAME, STRING_FIELDS_BY_CLASS_NAME

# fields that should be indexed as searchable fields
SEARCHABLE_FIELDS = sorted(
{
field_name
for field_names in STRING_FIELDS_BY_CLASS_NAME.values()
for field_names in chain(
STRING_FIELDS_BY_CLASS_NAME.values(),
EMAIL_FIELDS_BY_CLASS_NAME.values(), # stopgap MX-1766
)
for field_name in field_names
}
)

# classes that have fields that should be searchable
SEARCHABLE_CLASSES = sorted(
{name for name, field_names in STRING_FIELDS_BY_CLASS_NAME.items() if field_names}
{
name
for name, field_names in chain(
STRING_FIELDS_BY_CLASS_NAME.items(),
EMAIL_FIELDS_BY_CLASS_NAME.items(), # stopgap MX-1766
)
if field_names
}
)
2 changes: 1 addition & 1 deletion mex/backend/graph/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def fetch_merged_items(
)
for query_result in result.all():
for item in query_result["items"]:
for component in item["components"]:
for component in item["_components"]:
expand_references_in_search_result(component)
return result

Expand Down
76 changes: 43 additions & 33 deletions mex/backend/graph/cypher/fetch_extracted_or_rule_items.cql
Original file line number Diff line number Diff line change
Expand Up @@ -21,45 +21,55 @@ CALL () {
<%- if filter_by_query_string %>
OPTIONAL CALL db.index.fulltext.queryNodes("search_index", $query_string)
YIELD node AS hit, score
<%- endif %>
OPTIONAL MATCH (extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)
<%- if filter_by_stable_target_id -%>
-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
<%- endif %>
<%- set and_ = joiner("AND ") %>
CALL (hit) {
MATCH (extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
WHERE
elementId(hit) = elementId(extracted_or_rule_node)
AND ANY(label IN labels(extracted_or_rule_node) WHERE label IN $labels)
<%- if filter_by_stable_target_id %>
AND merged_node.identifier = $stable_target_id
<%- endif %>
RETURN extracted_or_rule_node, merged_node
UNION
MATCH (nested_node:<<nested_labels|join("|")>>)<-[]-(extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
WHERE
elementId(hit) = elementId(nested_node)
AND ANY(label IN labels(extracted_or_rule_node) WHERE label IN $labels)
<%- if filter_by_stable_target_id %>
AND merged_node.identifier = $stable_target_id
<%- endif %>
RETURN extracted_or_rule_node, merged_node
}
WITH DISTINCT extracted_or_rule_node, merged_node
<%- else %>
OPTIONAL MATCH (extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
WHERE
<%- if filter_by_query_string %>
<<and_()>>elementId(hit) = elementId(extracted_or_rule_node)
<%- endif %>
<%- if filter_by_stable_target_id %>
<<and_()>>merged_node.identifier = $stable_target_id
<%- endif %>
<<and_()>>ANY(label IN labels(extracted_or_rule_node) WHERE label IN $labels)
ANY(label IN labels(extracted_or_rule_node) WHERE label IN $labels)
<%- if filter_by_stable_target_id %>
AND merged_node.identifier = $stable_target_id
<%- endif %>
<%- endif %>
<%- endblock %>
RETURN COUNT(extracted_or_rule_node) AS total
}
CALL () {
<<-self.match_clause()>>
WITH extracted_or_rule_node
CALL (extracted_or_rule_node) {
OPTIONAL MATCH (extracted_or_rule_node)-[r]->(referenced_merged_node:<<merged_labels|join("|")>>)
RETURN CASE WHEN referenced_merged_node IS NOT NULL THEN {
label: type(r),
position: r.position,
value: referenced_merged_node.identifier
} ELSE NULL END AS ref
UNION
OPTIONAL MATCH (extracted_or_rule_node)-[r]->(referenced_nested_node:<<nested_labels|join("|")>>)
RETURN CASE WHEN referenced_nested_node IS NOT NULL THEN {
label: type(r),
position: r.position,
value: properties(referenced_nested_node)
} ELSE NULL END AS ref
}
WITH extracted_or_rule_node, collect(ref) AS refs
RETURN extracted_or_rule_node{.*, entityType: head(labels(extracted_or_rule_node)), _refs: refs}
ORDER BY extracted_or_rule_node.identifier, extracted_or_rule_node.entityType ASC
ORDER BY extracted_or_rule_node.identifier, head(labels(extracted_or_rule_node)) ASC
SKIP $skip
LIMIT $limit
WITH
extracted_or_rule_node,
[
(extracted_or_rule_node)-[r]->(referenced_merged_node:<<merged_labels|join("|")>>) |
{value: referenced_merged_node.identifier, position:r.position, label: type(r)}
] + [
(extracted_or_rule_node)-[r]->(referenced_nested_node:<<nested_labels|join("|")>>) |
{value: properties(referenced_nested_node), position:r.position , label: type(r)}
] AS refs
WITH
collect(
extracted_or_rule_node{.*, entityType: head(labels(extracted_or_rule_node)), _refs: refs}
) AS items
RETURN items
}
RETURN collect(extracted_or_rule_node) AS items, total;
RETURN items, total;
86 changes: 52 additions & 34 deletions mex/backend/graph/cypher/fetch_merged_items.cql
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Returns:
items: List of merged items, each item has the following attributes:
entityType: the type of the merged item
identifier: the identifier of the merged item
components: The rule and / or extracted items with the stableTargetId of this
_components: The rule and / or extracted items with the stableTargetId of this
merged item. Each component has an extra attribute `_refs` that contains the
values of nested objects as well as the identifiers of referenced items.
-#>
Expand All @@ -24,45 +24,63 @@ CALL () {
<%- if filter_by_query_string %>
OPTIONAL CALL db.index.fulltext.queryNodes("search_index", $query_string)
YIELD node AS hit, score
<%- endif %>
OPTIONAL MATCH (extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
<%- set and_ = joiner("AND ") %>
WHERE
<%- if filter_by_query_string %>
<<and_()>>elementId(hit) = elementId(extracted_or_rule_node)
<%- endif %>
<%- if filter_by_identifier %>
<<and_()>>merged_node.identifier = $identifier
<%- endif %>
<<and_()>>ANY(label IN labels(merged_node) WHERE label IN $labels)
CALL (hit) {
MATCH (extracted_or_rule_node:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
WHERE
elementId(hit) = elementId(extracted_or_rule_node)
AND ANY(label IN labels(merged_node) WHERE label IN $labels)
<%- if filter_by_identifier %>
AND merged_node.identifier = $identifier
<%- endif %>
RETURN merged_node
UNION
MATCH (nested_node:<<nested_labels|join("|")>>)<-[]-(:<<extracted_or_rule_labels|join("|")>>)-[:stableTargetId]->(merged_node:<<merged_labels|join("|")>>)
WHERE
elementId(hit) = elementId(nested_node)
AND ANY(label IN labels(merged_node) WHERE label IN $labels)
<%- if filter_by_identifier %>
AND merged_node.identifier = $identifier
<%- endif %>
RETURN merged_node
}
WITH DISTINCT merged_node AS merged_node
<%- else %>
OPTIONAL MATCH (merged_node:<<merged_labels|join("|")>>)
WHERE
ANY(label IN labels(merged_node) WHERE label IN $labels)
<%- if filter_by_identifier %>
AND merged_node.identifier = $identifier
<%- endif %>
<%- endif %>
<%- endblock %>
RETURN COUNT(merged_node) AS total
}
CALL () {
<<-self.match_clause()>>
OPTIONAL MATCH (extracted_or_rule_node)-[:stableTargetId]->(merged_node)
WITH extracted_or_rule_node, merged_node
CALL (extracted_or_rule_node) {
OPTIONAL MATCH (extracted_or_rule_node)-[r]->(referenced_merged_node:<<merged_labels|join("|")>>)
RETURN CASE WHEN referenced_merged_node IS NOT NULL THEN {
label: type(r),
position: r.position,
value: referenced_merged_node.identifier
} ELSE NULL END AS ref
UNION
OPTIONAL MATCH (extracted_or_rule_node)-[r]->(referenced_nested_node:<<nested_labels|join("|")>>)
RETURN CASE WHEN referenced_nested_node IS NOT NULL THEN {
label: type(r),
position: r.position,
value: properties(referenced_nested_node)
} ELSE NULL END AS ref
}
WITH merged_node, extracted_or_rule_node, collect(ref) AS refs
ORDER BY merged_node.identifier, extracted_or_rule_node.identifier, head(labels(extracted_or_rule_node)) ASC
WITH merged_node, collect(extracted_or_rule_node{.*, entityType: head(labels(extracted_or_rule_node)), _refs: refs}) AS extracted_or_rule_node
RETURN merged_node{entityType: head(labels(merged_node)), identifier: merged_node.identifier, components: extracted_or_rule_node}
ORDER BY merged_node.identifier, head(labels(merged_node)) ASC
SKIP $skip
LIMIT $limit
OPTIONAL MATCH (extracted_or_rule_node)-[:stableTargetId]->(merged_node)
ORDER BY extracted_or_rule_node.identifier, head(labels(extracted_or_rule_node)) ASC
WITH
extracted_or_rule_node,
merged_node,
[
(extracted_or_rule_node)-[r]->(referenced_merged_node:<<merged_labels|join("|")>>) |
{value: referenced_merged_node.identifier, position:r.position, label: type(r)}
] + [
(extracted_or_rule_node)-[r]->(referenced_nested_node:<<nested_labels|join("|")>>) |
{value: properties(referenced_nested_node), position:r.position , label: type(r)}
] AS refs
WITH
merged_node,
collect(
extracted_or_rule_node{.*, entityType: head(labels(extracted_or_rule_node)), _refs: refs}
) AS components
WITH
collect(
merged_node{.*, entityType: head(labels(merged_node)), _components: components}
) AS items
RETURN items
}
RETURN collect(merged_node) AS items, total;
RETURN items, total;
4 changes: 2 additions & 2 deletions mex/backend/merged/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,12 @@ def merge_search_result_item(
"""
extracted_items = [
EXTRACTED_MODEL_ADAPTER.validate_python(component)
for component in item["components"]
for component in item["_components"]
if component["entityType"] in EXTRACTED_MODEL_CLASSES_BY_NAME
]
raw_rules = [
component
for component in item["components"]
for component in item["_components"]
if component["entityType"] in RULE_MODEL_CLASSES_BY_NAME
]
if raw_rules:
Expand Down
8 changes: 1 addition & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@
ExtractedPrimarySource,
OrganizationalUnitRuleSetRequest,
OrganizationalUnitRuleSetResponse,
PreventiveOrganizationalUnit,
SubtractiveOrganizationalUnit,
)
from mex.common.settings import BaseSettings
from mex.common.transform import MExEncoder
Expand Down Expand Up @@ -365,11 +363,7 @@ def additive_organizational_unit(
def organizational_unit_rule_set_request(
additive_organizational_unit: AdditiveOrganizationalUnit,
) -> OrganizationalUnitRuleSetRequest:
return OrganizationalUnitRuleSetRequest(
additive=additive_organizational_unit,
preventive=PreventiveOrganizationalUnit(),
subtractive=SubtractiveOrganizationalUnit(),
)
return OrganizationalUnitRuleSetRequest(additive=additive_organizational_unit)


@pytest.fixture
Expand Down
Loading
Loading