diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py index 90a11151..93ef2996 100644 --- a/invenio_vocabularies/config.py +++ b/invenio_vocabularies/config.py @@ -213,3 +213,16 @@ def is_edmo(val): "days": 1, } """ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'.""" + +VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None +"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs. + +The path can be specified as either an absolute path or a relative path within the +Flask app instance folder (i.e. ``current_app.instance_path``). + +The CSV file should have the following columns: + +- `org_scheme`: The ORCiD organization ID. +- `org_id`: The ORCiD organization ID. +- `aff_id`: The affiliation ID to map to. +""" diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py index 9929d173..7a37997c 100644 --- a/invenio_vocabularies/contrib/names/datastreams.py +++ b/invenio_vocabularies/contrib/names/datastreams.py @@ -13,12 +13,13 @@ import tarfile from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import timedelta +from pathlib import Path import arrow import regex as re from flask import current_app from invenio_access.permissions import system_identity -from invenio_records.dictutils import dict_lookup +from werkzeug.utils import cached_property from invenio_vocabularies.contrib.names.s3client import S3OrcidClient @@ -48,9 +49,8 @@ def _fetch_orcid_data(self, orcid_to_sync, bucket): key = f"{suffix}/{orcid_to_sync}.xml" try: return self.s3_client.read_file(f"s3://{bucket}/{key}") - except Exception as e: - # TODO: log - return None + except Exception: + current_app.logger.exception("Failed to fetch ORCiD record.") def _process_lambda_file(self, fileobj): """Process the ORCiD lambda file and returns a list of ORCiDs to sync. @@ -139,24 +139,75 @@ def __init__(self, *args, test_mode=True, **kwargs): DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']" -"""Regex to filter out names with punctuations, symbols, decimal numbers and emojis.""" +"""Regex to filter out names with punctuation, symbols, numbers and emojis.""" + + +class OrcidOrgToAffiliationMapper: + """Default ORCiD Org ID to affiliation ID mapper.""" + + def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None): + """Constructor.""" + self._org_ids_mapping = org_ids_mapping + self._org_ids_mapping_file = org_ids_mapping_file + + @cached_property + def org_ids_mapping(self): + """Mapping of ORCiD org IDs to affiliation IDs.""" + org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get( + "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH" + ) + if org_ids_mapping_file: + org_ids_mapping_file = Path(org_ids_mapping_file) + # If the path is relative, prepend the instance path + if not org_ids_mapping_file.is_absolute(): + org_ids_mapping_file = ( + Path(current_app.instance_path) / org_ids_mapping_file + ) + with open(org_ids_mapping_file) as fin: + result = {} + reader = csv.reader(fin) + + # Check if the first row is a header + org_scheme, org_id, aff_id = next(reader) + if org_scheme.lower() != "org_scheme": + result[(org_scheme, org_id)] = aff_id + + for org_scheme, org_id, aff_id in reader: + result[(org_scheme, org_id)] = aff_id + + return result + + return self._org_ids_mapping or {} + + def __call__(self, org_scheme, org_id): + """Map an ORCiD org ID to an affiliation ID.""" + # By default we know that ROR IDs are linkable + if org_scheme == "ROR": + return org_id.split("/")[-1] + # Otherwise see if we have a mapping from other schemes to an affiliation ID + return self.org_ids_mapping.get((org_scheme, org_id)) class OrcidTransformer(BaseTransformer): """Transforms an ORCiD record into a names record.""" def __init__( - self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs + self, + *args, + names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, + org_id_to_affiliation_id_func=None, + **kwargs, ) -> None: """Constructor.""" self._names_exclude_regex = names_exclude_regex + self._org_id_to_affiliation_id_func = ( + org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper() + ) super().__init__() - def _is_valid_name(self, name): - """Check whether the name passes the regex.""" - if not self._names_exclude_regex: - return True - return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1)) + def org_id_to_affiliation_id(self, org_scheme, org_id): + """Convert and ORCiD org ID to a linkable affiliation ID.""" + return self._org_id_to_affiliation_id_func(org_scheme, org_id) def apply(self, stream_entry, **kwargs): """Applies the transformation to the stream entry.""" @@ -166,42 +217,88 @@ def apply(self, stream_entry, **kwargs): name = person.get("name") if name is None: - raise TransformerError(f"Name not found in ORCiD entry.") + raise TransformerError("Name not found in ORCiD entry.") if name.get("family-name") is None: - raise TransformerError(f"Family name not found in ORCiD entry.") + raise TransformerError("Family name not found in ORCiD entry.") if not self._is_valid_name(name["given-names"] + name["family-name"]): - raise TransformerError(f"Invalid characters in name.") + raise TransformerError("Invalid characters in name.") entry = { "id": orcid_id, "given_name": name.get("given-names"), "family_name": name.get("family-name"), "identifiers": [{"scheme": "orcid", "identifier": orcid_id}], - "affiliations": [], + "affiliations": self._extract_affiliations(record), } + stream_entry.entry = entry + return stream_entry + + def _is_valid_name(self, name): + """Check whether the name passes the regex.""" + if not self._names_exclude_regex: + return True + return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1)) + + def _extract_affiliations(self, record): + """Extract affiliations from the ORCiD record.""" + result = [] try: - employments = dict_lookup( - record, "activities-summary.employments.affiliation-group" + employments = ( + record.get("activities-summary", {}) + .get("employments", {}) + .get("affiliation-group", []) ) + + # If there are single values, the XML to dict, doesn't wrap them in a list if isinstance(employments, dict): employments = [employments] - history = set() + + # Remove the "employment-summary" nesting + employments = [ + employment.get("employment-summary", {}) for employment in employments + ] + for employment in employments: - terminated = employment["employment-summary"].get("end-date") - affiliation = dict_lookup( - employment, - "employment-summary.organization.name", - ) - if affiliation not in history and not terminated: - history.add(affiliation) - entry["affiliations"].append({"name": affiliation}) + terminated = employment.get("end-date") + if terminated: + continue + + org = employment["organization"] + aff_id = self._extract_affiliation_id(org) + + # Skip adding if the ID already exists in result + if aff_id and any(aff.get("id") == aff_id for aff in result): + continue + + # Skip adding if the name exists in result with no ID + if any( + aff.get("name") == org["name"] and "id" not in aff for aff in result + ): + continue + + aff = {"name": org["name"]} + if aff_id: + aff["id"] = aff_id + + result.append(aff) except Exception: pass - - stream_entry.entry = entry - return stream_entry + return result + + def _extract_affiliation_id(self, org): + """Extract the affiliation ID from an ORCiD organization.""" + dis_org = org.get("disambiguated-organization") + if not dis_org: + return + + aff_id = None + org_id = dis_org.get("disambiguated-organization-identifier") + org_scheme = dis_org.get("disambiguation-source") + if org_id and org_scheme: + aff_id = self.org_id_to_affiliation_id(org_scheme, org_id) + return aff_id class NamesServiceWriter(ServiceWriter): diff --git a/tests/contrib/names/test_names_datastreams.py b/tests/contrib/names/test_names_datastreams.py index 299bb274..c9920947 100644 --- a/tests/contrib/names/test_names_datastreams.py +++ b/tests/contrib/names/test_names_datastreams.py @@ -8,6 +8,8 @@ """Names data streams tests.""" +import os +import tempfile from copy import deepcopy from unittest.mock import patch @@ -23,6 +25,7 @@ ) from invenio_vocabularies.datastreams import StreamEntry from invenio_vocabularies.datastreams.errors import TransformerError, WriterError +from invenio_vocabularies.datastreams.transformers import XMLTransformer @pytest.fixture(scope="function") @@ -43,52 +46,115 @@ def name_full_data(): } -@pytest.fixture(scope="module") -def expected_from_xml(): - return { - "id": "0000-0001-8135-3489", - "given_name": "Lars Holm", - "family_name": "Nielsen", - "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], - "affiliations": [{"name": "CERN"}], - } - - -XML_ENTRY_DATA = bytes( - '\n' - '\n' - " \n" - " https://orcid.org/0000-0001-8135-3489\n" # noqa - " 0000-0001-8135-3489\n" - " orcid.org\n" - " \n" - ' \n' - ' \n' # noqa - " Lars Holm" # noqa - " Nielsen\n" # noqa - " \n" - ' \n' # noqa - " \n" - ' \n' # noqa - ' \n' # noqa - " \n" - " \n" - " \n" - " CERN\n" - " \n" - " \n" - " \n" - " \n" - " \n" - "\n", - encoding="raw_unicode_escape", -) - - -@pytest.fixture(scope="function") -def bytes_xml_data(): - # simplified version of an XML file of the ORCiD dump - return XML_ENTRY_DATA +# NOTE: This is a simplified version of the original XML data from ORCiD. Sections like +# works, funding, educations, etc. and attributes have been removed. +XML_ENTRY_DATA = b""" + + + https://orcid.org/0000-0001-8135-3489 + 0000-0001-8135-3489 + orcid.org + + + + Lars Holm + Nielsen + + + + + + + + 2012 + 03 + 16 + + + CERN + + https://ror.org/01ggx4157 + + ROR + + + + + + + + 2024 + 01 + + + ACME Inc. + + grid.123456.z + GRID + + + + + + + + 2007 + 08 + + + 2012 + 03 + + + European Southern Observatory + + grid.424907.c + GRID + + + + + + + +""" + +XML_ENTRY_DATA_SINGLE_EMPLOYMENT = b""" + + + https://orcid.org/0000-0001-8135-3489 + 0000-0001-8135-3489 + orcid.org + + + + Lars Holm + Nielsen + + + + + + + + 2012 + 03 + 16 + + + CERN + + https://ror.org/01ggx4157 + + ROR + + + + + + + +""" NAMES_TEST = { @@ -101,69 +167,206 @@ def bytes_xml_data(): @pytest.fixture(scope="function") -def dict_xml_entry(): - return StreamEntry( +def orcid_data(): + base = { + "orcid-identifier": { + "uri": "https://orcid.org/0000-0001-8135-3489", + "path": "0000-0001-8135-3489", + "host": "orcid.org", + }, + "person": { + "name": {"given-names": "Lars Holm", "family-name": "Nielsen"}, + }, + } + + employments = [ + { + "employment-summary": { + "organization": { + "name": "CERN", + "disambiguated-organization": { + "disambiguated-organization-identifier": "https://ror.org/01ggx4157", + "disambiguation-source": "ROR", + }, + }, + "start-date": {"year": "2012", "month": "03", "day": "16"}, + } + }, { - "orcid-identifier": { - "uri": "https://orcid.org/0000-0001-8135-3489", - "path": "0000-0001-8135-3489", - "host": "orcid.org", + "employment-summary": { + "organization": { + "name": "ACME Inc.", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.123456.z", + "disambiguation-source": "GRID", + }, + }, + "start-date": {"year": "2024", "month": "01"}, + } + }, + { + "employment-summary": { + "organization": { + "name": "European Southern Observatory", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.424907.c", + "disambiguation-source": "GRID", + }, + }, + "start-date": {"year": "2007", "month": "08"}, + "end-date": {"year": "2012", "month": "03"}, }, - "person": { - "name": { - "given-names": "Lars Holm", - "family-name": "Nielsen", - "@visibility": "public", - "@path": "0000-0001-8135-3489", + }, + ] + + return { + "xml": { + "multi_employment": XML_ENTRY_DATA, + "single_employment": XML_ENTRY_DATA_SINGLE_EMPLOYMENT, + }, + "json": { + "multi_employment": { + **base, + "activities-summary": { + "employments": {"affiliation-group": employments}, }, - "external-identifiers": { - "@path": "/0000-0001-8135-3489/external-identifiers" + }, + "single_employment": { + **base, + "activities-summary": { + # NOTE: Because the XML data has only one employment, the + # transformer will not create a list of employments, but instead a + # single employment object. + "employments": {"affiliation-group": employments[0]} }, - "@path": "/0000-0001-8135-3489/person", }, - "activities-summary": { - "employments": { - "affiliation-group": { - "employment-summary": {"organization": {"name": "CERN"}} - }, - "@path": "/0000-0001-8135-3489/employments", + "duplicate_affiliations": { + **base, + "activities-summary": { + "employments": {"affiliation-group": employments + employments}, }, - "@path": "/0000-0001-8135-3489/activities", }, - "@path": "/0000-0001-8135-3489", - } - ) + }, + } -def test_orcid_transformer(dict_xml_entry, expected_from_xml): +@pytest.fixture(scope="module") +def expected_from_xml(): + base = { + "id": "0000-0001-8135-3489", + "given_name": "Lars Holm", + "family_name": "Nielsen", + "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], + } + + return { + "multi_employment": { + **base, + "affiliations": [ + {"id": "01ggx4157", "name": "CERN"}, + # NOTE: GRID identifiers do not result in linked affiliations + {"name": "ACME Inc."}, + # NOTE: terminated employments are not included in the affiliations + ], + }, + "single_employment": { + **base, + "affiliations": [{"id": "01ggx4157", "name": "CERN"}], + }, + "duplicate_affiliations": { + **base, + "affiliations": [ + # Affiliations are deduplicated + {"id": "01ggx4157", "name": "CERN"}, + {"name": "ACME Inc."}, + ], + }, + } + + +@pytest.fixture(scope="function") +def org_ids_mapping_file_config(app): + """ORCiD organization IDs mappings CSV file.""" + fout = tempfile.NamedTemporaryFile(mode="w", delete=False) + fout.write('"GRID","grid.123456.z","acme_inc_id"\n') + fout.close() + + old_config = app.config.get("VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH") + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = fout.name + yield + + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = old_config + os.unlink(fout.name) + + +def test_orcid_xml_transform(orcid_data): + """Test XML transformer with ORCiD record. + + NOTE: this might look somewhat "redundant", since we're again testing if the + XMLTransformer works as expected. However, this is a more specific test + that demonstrates how the transformer behaves with more complex XML data, that + can have e.g. nested elements that can be arrays or objects. + """ + xml_data = orcid_data["xml"] + json_data = orcid_data["json"] + + for key, data in xml_data.items(): + transformer = XMLTransformer(root_element="record") + transform_result = transformer.apply(StreamEntry(data)).entry + assert transform_result == json_data[key] + + +def test_orcid_transformer(app, orcid_data, expected_from_xml): + """Test ORCiD transformer data.""" transformer = OrcidTransformer() - assert expected_from_xml == transformer.apply(dict_xml_entry).entry + input_data = orcid_data["json"] + + for key, data in input_data.items(): + assert expected_from_xml[key] == transformer.apply(StreamEntry(data)).entry, key + + +def test_orcid_transformer_org_ids_mapping_from_file( + app, + orcid_data, + expected_from_xml, + org_ids_mapping_file_config, +): + """Test ORCiD transformer data with org IDs mapping file.""" + transformer = OrcidTransformer() + input_data = orcid_data["json"]["multi_employment"] + expected = deepcopy(expected_from_xml["multi_employment"]) + expected["affiliations"] = [ + {"id": "01ggx4157", "name": "CERN"}, + {"id": "acme_inc_id", "name": "ACME Inc."}, + ] + + assert expected == transformer.apply(StreamEntry(input_data)).entry @pytest.mark.parametrize("name,is_valid_name", NAMES_TEST.items()) -def test_orcid_transformer_different_names(dict_xml_entry, name, is_valid_name): +def test_orcid_transformer_name_filtering(orcid_data, name, is_valid_name): transformer = OrcidTransformer() - val = deepcopy(dict_xml_entry) + val = deepcopy(orcid_data["json"]["multi_employment"]) if is_valid_name: - assert transformer.apply(val).entry + assert transformer.apply(StreamEntry(val)).entry else: with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - transformer.apply(val) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + transformer.apply(StreamEntry(val)) with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = "" - val.entry["person"]["name"]["family-name"] = name - transformer.apply(val) + val["person"]["name"]["given-names"] = "" + val["person"]["name"]["family-name"] = name + transformer.apply(StreamEntry(val)) @pytest.mark.parametrize("name", NAMES_TEST.keys()) -def test_orcid_transformer_different_names_no_regex(dict_xml_entry, name): +def test_orcid_transformer_different_names_no_regex(orcid_data, name): transformer = OrcidTransformer(names_exclude_regex=None) - val = deepcopy(dict_xml_entry) - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - assert transformer.apply(val).entry + val = deepcopy(orcid_data["json"]["multi_employment"]) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + assert transformer.apply(StreamEntry(val)).entry class MockResponse: @@ -172,14 +375,14 @@ class MockResponse: @patch("requests.get", side_effect=lambda url, headers: MockResponse()) -def test_orcid_http_reader(_, bytes_xml_data): +def test_orcid_http_reader(_): reader = OrcidHTTPReader(id="0000-0001-8135-3489") results = [] for entry in reader.read(): results.append(entry) assert len(results) == 1 - assert bytes_xml_data == results[0] + assert XML_ENTRY_DATA == results[0] def test_names_service_writer_create(app, db, search_clear, name_full_data): diff --git a/tests/datastreams/test_transformers.py b/tests/datastreams/test_transformers.py index 8c5b1943..dd7628d7 100644 --- a/tests/datastreams/test_transformers.py +++ b/tests/datastreams/test_transformers.py @@ -18,26 +18,51 @@ @pytest.fixture(scope="module") def expected_from_xml(): return { - "record": { - "field_one": "value", - "multi_field": {"some": "value", "another": "value too"}, - } + "top_level_field": "top-level single value", + "top_level_object_field": { + "some": "value", + "another": "value too", + "nested_array_field": { + "@array_attr": "value", + "array_element_object": [ + { + "@obj_attr": "first", + "element_foo": "value1", + "element_bar": "value1", + }, + { + "@obj_attr": "second", + "element_foo": "value2", + "element_bar": "value2", + "element_qux": "value2", + }, + ], + }, + }, } def test_xml_transformer(expected_from_xml): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "\n" - " value\n" - " \n" - " value\n" - " value too\n" - " \n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer() @@ -46,15 +71,25 @@ def test_xml_transformer(expected_from_xml): def test_bad_xml_transformer(): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "value\n" - "\n" - " value\n" - " value too\n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer(root_element="field_two")