From 92eabb1a77967439a41bc54ae17a0f26a2259067 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Mon, 25 Nov 2024 11:06:20 +0100
Subject: [PATCH] names: support mapping affiliation IDs

* Allows providing a CSV file to the ORCiD transformer for mapping
  organization identifiers encountered in the ORCiD data to linkable
  affiliation IDs.
---
 invenio_vocabularies/config.py                 |  10 +
 .../contrib/names/datastreams.py               | 121 +++++--
 tests/contrib/names/test_names_datastreams.py  | 298 +++++++++++++-----
 tests/datastreams/test_transformers.py         |  83 +++--
 4 files changed, 382 insertions(+), 130 deletions(-)

diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
index 90a11151..417ed8db 100644
--- a/invenio_vocabularies/config.py
+++ b/invenio_vocabularies/config.py
@@ -213,3 +213,13 @@ def is_edmo(val):
     "days": 1,
 }
 """ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
+
+VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
+"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.
+
+The CSV file should have the following columns:
+
+- `org_scheme`: The scheme of the organization ID (e.g. `GRID`).
+- `org_id`: The organization ID in that scheme.
+- `aff_id`: The affiliation ID to map to.
+"""
diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py
index 8299336e..267116d9 100644
--- a/invenio_vocabularies/contrib/names/datastreams.py
+++ b/invenio_vocabularies/contrib/names/datastreams.py
@@ -13,11 +13,13 @@
 import tarfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta
+from pathlib import Path
 
 import arrow
 import regex as re
 from flask import current_app
 from invenio_access.permissions import system_identity
+from werkzeug.utils import cached_property
 
 from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
 
@@ -141,6 +143,52 @@ def __init__(self, *args, test_mode=True, **kwargs):
 """Regex to filter out names with punctuation, symbols, numbers and emojis."""
 
 
+class OrcidOrgToAffiliationMapper:
+    """Default ORCiD Org ID to affiliation ID mapper."""
+
+    def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
+        """Constructor."""
+        self._org_ids_mapping = org_ids_mapping
+        self._org_ids_mapping_file = org_ids_mapping_file
+
+    @cached_property
+    def org_ids_mapping(self):
+        """Mapping of ORCiD org IDs to affiliation IDs."""
+        org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
+            "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
+        )
+        if org_ids_mapping_file:
+            org_ids_mapping_file = Path(org_ids_mapping_file)
+            # If the path is relative, prepend the instance path
+            if not org_ids_mapping_file.is_absolute():
+                org_ids_mapping_file = (
+                    Path(current_app.instance_path) / org_ids_mapping_file
+                )
+            with open(org_ids_mapping_file) as fin:
+                result = {}
+                reader = csv.reader(fin)
+
+                # Skip the header row if present; otherwise treat the first row as data
+                org_scheme, org_id, aff_id = next(reader)
+                if org_scheme.lower() != "org_scheme":
+                    result[(org_scheme, org_id)] = aff_id
+
+                for org_scheme, org_id, aff_id in reader:
+                    result[(org_scheme, org_id)] = aff_id
+
+                return result
+
+        return self._org_ids_mapping or {}
+
+    def __call__(self, org_scheme, org_id):
+        """Map an ORCiD org ID to an affiliation ID."""
+        # By default we know that ROR IDs are linkable
+        if org_scheme == "ROR":
+            return org_id.split("/")[-1]
+        # Otherwise see if we have a mapping from other schemes to an affiliation ID
+        return self.org_ids_mapping.get((org_scheme, org_id))
+
+
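For illustration, a minimal usage sketch of the mapper introduced above. The example identifiers mirror the patch's test data (GRID/grid.123456.z maps to acme_inc_id), and an active Flask application context is assumed, since the mapper also consults VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH from the app config:

from invenio_vocabularies.contrib.names.datastreams import OrcidOrgToAffiliationMapper

# In-memory mapping keyed by (org_scheme, org_id) tuples. Equivalently, a CSV file
# (optionally with an "org_scheme,org_id,aff_id" header row) can be passed via
# org_ids_mapping_file= or configured through VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH:
#   GRID,grid.123456.z,acme_inc_id
mapper = OrcidOrgToAffiliationMapper(
    org_ids_mapping={("GRID", "grid.123456.z"): "acme_inc_id"}
)

# ROR IDs are considered linkable by default: only the final URL segment is kept.
assert mapper("ROR", "https://ror.org/01ggx4157") == "01ggx4157"
# Other schemes are resolved through the mapping...
assert mapper("GRID", "grid.123456.z") == "acme_inc_id"
# ...and unmapped identifiers yield None, so no affiliation ID gets linked.
assert mapper("RINGGOLD", "54249") is None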
 class OrcidTransformer(BaseTransformer):
     """Transforms an ORCiD record into a names record."""
 
@@ -148,30 +196,19 @@ def __init__(
         self,
         *args,
         names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
-        affiliation_relation_schemes=None,
-        org_scheme_mappping=None,
+        org_id_to_affiliation_id_func=None,
         **kwargs,
     ) -> None:
         """Constructor."""
         self._names_exclude_regex = names_exclude_regex
-        self._affiliation_relation_schemes = affiliation_relation_schemes
-        self._org_scheme_mappping = org_scheme_mappping
+        self._org_id_to_affiliation_id_func = (
+            org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
+        )
         super().__init__()
 
-    @property
-    def affiliation_relation_schemes(self):
-        """Allowed affiliation schemes."""
-        return self._affiliation_relation_schemes or {"ror"}
-
-    @property
-    def org_scheme_mappping(self):
-        """Mapping of ORCiD org schemes to affiliation schemes."""
-        return self._org_scheme_mappping or {
-            "GRID": "grid",
-            "ROR": "ror",
-            "ISNI": "isni",
-            "FUNDREF": "fundref",
-        }
+    def org_id_to_affiliation_id(self, org_scheme, org_id):
+        """Convert an ORCiD org ID to a linkable affiliation ID."""
+        return self._org_id_to_affiliation_id_func(org_scheme, org_id)
 
     def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
@@ -212,36 +249,52 @@ def _extract_affiliations(self, record):
         employments = (
             record.get("activities-summary", {})
             .get("employments", {})
-            .get("affiliation-group")
+            .get("affiliation-group", [])
         )
+
+        # If there is a single value, the XML-to-dict conversion doesn't wrap it in a list
         if isinstance(employments, dict):
             employments = [employments]
 
+        # Remove the "employment-summary" nesting
+        employments = [
+            employment.get("employment-summary", {}) for employment in employments
+        ]
+
         history = set()
         for employment in employments:
-            terminated = employment["employment-summary"].get("end-date")
-            org = employment["employment-summary"]["organization"]
-            if org["name"] not in history and not terminated:
-                history.add(org["name"])
-                aff = {"name": org["name"]}
+            terminated = employment.get("end-date")
+            org = employment["organization"]
 
-                if org.get("disambiguated-organization"):
-                    dis_org = org["disambiguated-organization"]
-                    org_id = dis_org.get("disambiguated-organization-identifier")
-                    org_scheme = dis_org.get("disambiguation-source")
-                    aff_scheme = self.org_scheme_mappping.get(org_scheme)
+            if terminated or org["name"] in history:
+                continue
 
-                    if org_id and aff_scheme in self.affiliation_relation_schemes:
-                        if aff_scheme == "ror":
-                            org_id = org_id.split("/")[-1]
+            history.add(org["name"])
+            aff = {"name": org["name"]}
 
-                        aff["id"] = org_id
+            # Extract the org ID to link to the affiliation vocabulary
+            aff_id = self._extract_affiliation_id(org)
+            if aff_id:
+                aff["id"] = aff_id
 
-                result.append(aff)
+            result.append(aff)
         except Exception:
             pass
         return result
 
+    def _extract_affiliation_id(self, org):
+        """Extract the affiliation ID from an ORCiD organization."""
+        dis_org = org.get("disambiguated-organization")
+        if not dis_org:
+            return
+
+        aff_id = None
+        org_id = dis_org.get("disambiguated-organization-identifier")
+        org_scheme = dis_org.get("disambiguation-source")
+        if org_id and org_scheme:
+            aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
+        return aff_id
+
 
 class NamesServiceWriter(ServiceWriter):
     """Names service writer."""
diff --git a/tests/contrib/names/test_names_datastreams.py b/tests/contrib/names/test_names_datastreams.py
index 46c89cc6..3abd82ee 100644
--- a/tests/contrib/names/test_names_datastreams.py
+++ 
b/tests/contrib/names/test_names_datastreams.py @@ -8,6 +8,8 @@ """Names data streams tests.""" +import os +import tempfile from copy import deepcopy from unittest.mock import patch @@ -23,6 +25,7 @@ ) from invenio_vocabularies.datastreams import StreamEntry from invenio_vocabularies.datastreams.errors import TransformerError, WriterError +from invenio_vocabularies.datastreams.transformers import XMLTransformer @pytest.fixture(scope="function") @@ -43,17 +46,8 @@ def name_full_data(): } -@pytest.fixture(scope="module") -def expected_from_xml(): - return { - "id": "0000-0001-8135-3489", - "given_name": "Lars Holm", - "family_name": "Nielsen", - "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], - "affiliations": [{"id": "01ggx4157", "name": "CERN"}], - } - - +# NOTE: This is a simplified version of the original XML data from ORCiD. Sections like +# works, funding, educations, etc. and attributes have been removed. XML_ENTRY_DATA = b""" @@ -66,7 +60,6 @@ def expected_from_xml(): Lars Holm Nielsen - @@ -89,6 +82,21 @@ def expected_from_xml(): + + 2024 + 01 + + + ACME Inc. + + grid.123456.z + GRID + + + + + + 2007 08 @@ -100,22 +108,53 @@ def expected_from_xml(): European Southern Observatory - 54249 - RINGGOLD + grid.424907.c + GRID - + """ - -@pytest.fixture(scope="function") -def bytes_xml_data(): - # simplified version of an XML file of the ORCiD dump - return XML_ENTRY_DATA +XML_ENTRY_DATA_SINGLE_EMPLOYMENT = b""" + + + https://orcid.org/0000-0001-8135-3489 + 0000-0001-8135-3489 + orcid.org + + + + Lars Holm + Nielsen + + + + + + + + 2012 + 03 + 16 + + + CERN + + https://ror.org/01ggx4157 + + ROR + + + + + + + +""" NAMES_TEST = { @@ -128,77 +167,192 @@ def bytes_xml_data(): @pytest.fixture(scope="function") -def dict_xml_entry(): - return StreamEntry( +def orcid_data(): + base = { + "orcid-identifier": { + "uri": "https://orcid.org/0000-0001-8135-3489", + "path": "0000-0001-8135-3489", + "host": "orcid.org", + }, + "person": { + "name": {"given-names": "Lars Holm", "family-name": "Nielsen"}, + }, + } + + employments = [ { - "orcid-identifier": { - "uri": "https://orcid.org/0000-0001-8135-3489", - "path": "0000-0001-8135-3489", - "host": "orcid.org", - }, - "person": { - "name": { - "given-names": "Lars Holm", - "family-name": "Nielsen", - "@visibility": "public", - "@path": "0000-0001-8135-3489", + "employment-summary": { + "organization": { + "name": "CERN", + "disambiguated-organization": { + "disambiguated-organization-identifier": "https://ror.org/01ggx4157", + "disambiguation-source": "ROR", + }, }, - "external-identifiers": { - "@path": "/0000-0001-8135-3489/external-identifiers" + "start-date": {"year": "2012", "month": "03", "day": "16"}, + } + }, + { + "employment-summary": { + "organization": { + "name": "ACME Inc.", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.123456.z", + "disambiguation-source": "GRID", + }, }, - "@path": "/0000-0001-8135-3489/person", - }, - "activities-summary": { - "employments": { - "affiliation-group": { - "employment-summary": { - "organization": { - "name": "CERN", - "disambiguated-organization": { - "disambiguated-organization-identifier": "https://ror.org/01ggx4157", - "disambiguation-source": "ROR", - }, - } - } + "start-date": {"year": "2024", "month": "01"}, + } + }, + { + "employment-summary": { + "organization": { + "name": "European Southern Observatory", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.424907.c", + "disambiguation-source": "GRID", 
}, - "@path": "/0000-0001-8135-3489/employments", }, - "@path": "/0000-0001-8135-3489/activities", + "start-date": {"year": "2007", "month": "08"}, + "end-date": {"year": "2012", "month": "03"}, + }, + }, + ] + + return { + "xml": { + "multi_employment": XML_ENTRY_DATA, + "single_employment": XML_ENTRY_DATA_SINGLE_EMPLOYMENT, + }, + "json": { + "multi_employment": { + **base, + "activities-summary": { + "employments": {"affiliation-group": employments}, + }, + }, + "single_employment": { + **base, + "activities-summary": { + # NOTE: Because the XML data has only one employment, the + # transformer will not create a list of employments, but instead a + # single employment object. + "employments": {"affiliation-group": employments[0]} + }, }, - "@path": "/0000-0001-8135-3489", - } - ) + }, + } + + +@pytest.fixture(scope="module") +def expected_from_xml(): + base = { + "id": "0000-0001-8135-3489", + "given_name": "Lars Holm", + "family_name": "Nielsen", + "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], + } + + return { + "multi_employment": { + **base, + "affiliations": [ + {"id": "01ggx4157", "name": "CERN"}, + # NOTE: GRID identifiers do not result in linked affiliations + {"name": "ACME Inc."}, + # NOTE: terminated employments are not included in the affiliations + ], + }, + "single_employment": { + **base, + "affiliations": [{"id": "01ggx4157", "name": "CERN"}], + }, + } + + +@pytest.fixture(scope="function") +def org_ids_mapping_file_config(app): + """ORCiD organization IDs mappings CSV file.""" + fout = tempfile.NamedTemporaryFile(mode="w", delete=False) + fout.write('"GRID","grid.123456.z","acme_inc_id"\n') + fout.close() + old_config = app.config.get("VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH") + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = fout.name + yield -def test_orcid_transformer(dict_xml_entry, expected_from_xml): + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = old_config + os.unlink(fout.name) + + +def test_orcid_xml_transform(orcid_data): + """Test XML transformer with ORCiD record. + + NOTE: this might look somewhat "redundant", since we're again testing if the + XMLTransformer works as expected. However, this is a more specific test + that demonstrates how the transformer behaves with more complex XML data, that + can have e.g. nested elements that can be arrays or objects. 
+ """ + xml_data = orcid_data["xml"] + json_data = orcid_data["json"] + + for key, data in xml_data.items(): + transformer = XMLTransformer(root_element="record") + transform_result = transformer.apply(StreamEntry(data)).entry + assert transform_result == json_data[key] + + +def test_orcid_transformer(app, orcid_data, expected_from_xml): + """Test ORCiD transformer data.""" transformer = OrcidTransformer() - assert expected_from_xml == transformer.apply(dict_xml_entry).entry + input_data = orcid_data["json"] + + for key, data in input_data.items(): + assert expected_from_xml[key] == transformer.apply(StreamEntry(data)).entry, key + + +def test_orcid_transformer_org_ids_mapping_from_file( + app, + orcid_data, + expected_from_xml, + org_ids_mapping_file_config, +): + """Test ORCiD transformer data with org IDs mapping file.""" + transformer = OrcidTransformer() + input_data = orcid_data["json"]["multi_employment"] + expected = deepcopy(expected_from_xml["multi_employment"]) + expected["affiliations"] = [ + {"id": "01ggx4157", "name": "CERN"}, + {"id": "acme_inc_id", "name": "ACME Inc."}, + ] + + assert expected == transformer.apply(StreamEntry(input_data)).entry @pytest.mark.parametrize("name,is_valid_name", NAMES_TEST.items()) -def test_orcid_transformer_name_filtering(dict_xml_entry, name, is_valid_name): +def test_orcid_transformer_name_filtering(orcid_data, name, is_valid_name): transformer = OrcidTransformer() - val = deepcopy(dict_xml_entry) + val = deepcopy(orcid_data["json"]["multi_employment"]) if is_valid_name: - assert transformer.apply(val).entry + assert transformer.apply(StreamEntry(val)).entry else: with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - transformer.apply(val) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + transformer.apply(StreamEntry(val)) with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = "" - val.entry["person"]["name"]["family-name"] = name - transformer.apply(val) + val["person"]["name"]["given-names"] = "" + val["person"]["name"]["family-name"] = name + transformer.apply(StreamEntry(val)) @pytest.mark.parametrize("name", NAMES_TEST.keys()) -def test_orcid_transformer_different_names_no_regex(dict_xml_entry, name): +def test_orcid_transformer_different_names_no_regex(orcid_data, name): transformer = OrcidTransformer(names_exclude_regex=None) - val = deepcopy(dict_xml_entry) - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - assert transformer.apply(val).entry + val = deepcopy(orcid_data["json"]["multi_employment"]) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + assert transformer.apply(StreamEntry(val)).entry class MockResponse: @@ -207,14 +361,14 @@ class MockResponse: @patch("requests.get", side_effect=lambda url, headers: MockResponse()) -def test_orcid_http_reader(_, bytes_xml_data): +def test_orcid_http_reader(_): reader = OrcidHTTPReader(id="0000-0001-8135-3489") results = [] for entry in reader.read(): results.append(entry) assert len(results) == 1 - assert bytes_xml_data == results[0] + assert XML_ENTRY_DATA == results[0] def test_names_service_writer_create(app, db, search_clear, name_full_data): diff --git a/tests/datastreams/test_transformers.py b/tests/datastreams/test_transformers.py index 8c5b1943..dd7628d7 100644 --- a/tests/datastreams/test_transformers.py +++ 
b/tests/datastreams/test_transformers.py @@ -18,26 +18,51 @@ @pytest.fixture(scope="module") def expected_from_xml(): return { - "record": { - "field_one": "value", - "multi_field": {"some": "value", "another": "value too"}, - } + "top_level_field": "top-level single value", + "top_level_object_field": { + "some": "value", + "another": "value too", + "nested_array_field": { + "@array_attr": "value", + "array_element_object": [ + { + "@obj_attr": "first", + "element_foo": "value1", + "element_bar": "value1", + }, + { + "@obj_attr": "second", + "element_foo": "value2", + "element_bar": "value2", + "element_qux": "value2", + }, + ], + }, + }, } def test_xml_transformer(expected_from_xml): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "\n" - " value\n" - " \n" - " value\n" - " value too\n" - " \n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer() @@ -46,15 +71,25 @@ def test_xml_transformer(expected_from_xml): def test_bad_xml_transformer(): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "value\n" - "\n" - " value\n" - " value too\n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer(root_element="field_two")
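A brief aside on the XML-to-dict behaviour that these tests and the transformer's list normalization rely on: a single child element is converted to a plain dict, while repeated children become a list. The sketch below demonstrates the idea with the third-party xmltodict package, used here only as a stand-in for the package's own XMLTransformer; the element names are made up for the example.

import xmltodict

single = xmltodict.parse(b"<record><group><name>CERN</name></group></record>")
multi = xmltodict.parse(
    b"<record>"
    b"<group><name>CERN</name></group>"
    b"<group><name>ESO</name></group>"
    b"</record>"
)

# One <group> child yields a dict; two <group> children yield a list of dicts. This is
# why OrcidTransformer._extract_affiliations wraps a lone dict into a list before iterating.
assert single["record"]["group"] == {"name": "CERN"}
assert multi["record"]["group"] == [{"name": "CERN"}, {"name": "ESO"}]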