Skip to content

Commit

Permalink
contrib: improve search accuracy for names, funders, affiliations
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Nov 26, 2024
1 parent 1cd09ef commit 17d4bf8
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 42 deletions.
28 changes: 18 additions & 10 deletions invenio_vocabularies/contrib/affiliations/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from invenio_i18n import lazy_gettext as _
from invenio_records_resources.services import SearchOptions
from invenio_records_resources.services.records.components import DataComponent
from invenio_records_resources.services.records.params import SuggestQueryParser
from invenio_records_resources.services.records.queryparser import (
CompositeSuggestQueryParser,
)
from werkzeug.local import LocalProxy

from ...services.components import PIDComponent
Expand All @@ -24,23 +26,29 @@
affiliation_edmo_country_mappings = LocalProxy(
lambda: current_app.config["VOCABULARIES_AFFILIATIONS_EDMO_COUNTRY_MAPPING"]
)
localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
localized_title = LocalProxy(lambda: f"title.{get_locale()}^7")


class AffiliationsSearchOptions(SearchOptions):
"""Search options."""

suggest_parser_cls = SuggestQueryParser.factory(
suggest_parser_cls = CompositeSuggestQueryParser.factory(
fields=[
"name^100",
"acronym.keyword^100",
"acronym^40",
# We boost the acronym fields, since they're smaller words and are more
# likely to be used in a query.
"acronym.keyword^50",
"acronym^10",
"name^10",
# Aliases can sometimes be shorter, so we boost them a bit.
"aliases^5",
localized_title,
"id^20",
"aliases^20",
"id^2",
# Allow searching by identifier directly (e.g. ROR)
"identifiers.identifier",
"country",
"country_name",
"types",
],
type="most_fields", # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
fuzziness="AUTO", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
)

sort_default = "bestmatch"
Expand Down
31 changes: 19 additions & 12 deletions invenio_vocabularies/contrib/funders/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from invenio_i18n import lazy_gettext as _
from invenio_records_resources.services import SearchOptions
from invenio_records_resources.services.records.components import DataComponent
from invenio_records_resources.services.records.params import SuggestQueryParser
from invenio_records_resources.services.records.queryparser import (
CompositeSuggestQueryParser,
)
from werkzeug.local import LocalProxy

from ...services.components import ModelPIDComponent
Expand All @@ -23,24 +25,29 @@
funder_fundref_doi_prefix = LocalProxy(
lambda: current_app.config["VOCABULARIES_FUNDER_DOI_PREFIX"]
)
localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
localized_title = LocalProxy(lambda: f"title.{get_locale()}^7")


class FundersSearchOptions(SearchOptions):
"""Search options."""

suggest_parser_cls = SuggestQueryParser.factory(
suggest_parser_cls = CompositeSuggestQueryParser.factory(
fields=[
"name^100",
"acronym.keyword^100",
"acronym^40",
# We boost the acronym fields, since they're smaller words and are more
# likely to be used in a query.
"acronym.keyword^50",
"acronym^10",
"name^10",
# Aliases can sometimes be shorter, so we boost them a bit.
"aliases^5",
localized_title,
"id^20",
"aliases^20",
"identifiers.identifier^10",
],
type="most_fields", # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
fuzziness="AUTO", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
"id^2",
# Allow searching by identifier directly (e.g. ROR)
"identifiers.identifier",
"country",
"country_name",
"types",
]
)

sort_default = "bestmatch"
Expand Down
21 changes: 12 additions & 9 deletions invenio_vocabularies/contrib/names/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
DataComponent,
RelationsComponent,
)
from invenio_records_resources.services.records.params import SuggestQueryParser
from invenio_records_resources.services.records.queryparser import (
CompositeSuggestQueryParser,
)
from werkzeug.local import LocalProxy

from ...services.components import PIDComponent
Expand All @@ -26,16 +28,17 @@
class NamesSearchOptions(SearchOptions):
"""Search options."""

suggest_parser_cls = SuggestQueryParser.factory(
suggest_parser_cls = CompositeSuggestQueryParser.factory(
fields=[
"given_name^100",
"name^70",
"family_name^50",
"identifiers.identifier^20",
"affiliations.name^20",
"name^5",
# We boost the affiliation acronym fields, since they're short and more
# likely to be used in a query.
"affiliations.acronym.keyword^3",
"affiliations.acronym",
"affiliations.name",
# Allow searching by identifier directly (e.g. ORCID)
"identifiers.identifier",
],
type="most_fields", # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
fuzziness="AUTO",
)

sort_default = "bestmatch"
Expand Down
8 changes: 1 addition & 7 deletions invenio_vocabularies/services/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,11 @@
from invenio_i18n import lazy_gettext as _
from invenio_records_resources.services import (
Link,
LinksTemplate,
RecordService,
RecordServiceConfig,
SearchOptions,
pagination_links,
)
from invenio_records_resources.services.base import (
ConditionalLink,
Service,
ServiceListResult,
)
from invenio_records_resources.services.base import ConditionalLink
from invenio_records_resources.services.records.components import DataComponent
from invenio_records_resources.services.records.params import (
FilterParam,
Expand Down
5 changes: 1 addition & 4 deletions tests/contrib/names/test_names_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"""Test the name vocabulary resource."""

import json
from copy import deepcopy

import pytest

Expand Down Expand Up @@ -193,8 +192,6 @@ def test_names_suggest_sort(client_with_credentials, example_multiple_names, h,
# With affiliation
res = client_with_credentials.get(f"{prefix}?suggest=john%20wwe", headers=h)
assert res.status_code == 200
assert (
res.json["hits"]["total"] == 3
) # Will find 3 johns but WWE affiliation should be at the top
assert res.json["hits"]["total"] == 1
assert res.json["hits"]["hits"][0]["name"] == "Cena, John"
assert res.json["hits"]["hits"][0]["affiliations"][0]["name"] == "WWE"

0 comments on commit 17d4bf8

Please sign in to comment.