diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
index 90a11151..93ef2996 100644
--- a/invenio_vocabularies/config.py
+++ b/invenio_vocabularies/config.py
@@ -213,3 +213,16 @@ def is_edmo(val):
"days": 1,
}
"""ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
+
+VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
+"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.
+
+The path can be specified as either an absolute path or a relative path within the
+Flask app instance folder (i.e. ``current_app.instance_path``).
+
+The CSV file should have the following columns:
+
+- `org_scheme`: The ORCiD organization ID scheme (e.g. "ROR", "GRID").
+- `org_id`: The ORCiD organization ID.
+- `aff_id`: The affiliation ID to map to.
+"""
diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py
index 9929d173..7a37997c 100644
--- a/invenio_vocabularies/contrib/names/datastreams.py
+++ b/invenio_vocabularies/contrib/names/datastreams.py
@@ -13,12 +13,13 @@
import tarfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import timedelta
+from pathlib import Path
import arrow
import regex as re
from flask import current_app
from invenio_access.permissions import system_identity
-from invenio_records.dictutils import dict_lookup
+from werkzeug.utils import cached_property
from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
@@ -48,9 +49,8 @@ def _fetch_orcid_data(self, orcid_to_sync, bucket):
key = f"{suffix}/{orcid_to_sync}.xml"
try:
return self.s3_client.read_file(f"s3://{bucket}/{key}")
- except Exception as e:
- # TODO: log
- return None
+ except Exception:
+ current_app.logger.exception("Failed to fetch ORCiD record.")
def _process_lambda_file(self, fileobj):
"""Process the ORCiD lambda file and returns a list of ORCiDs to sync.
@@ -139,24 +139,75 @@ def __init__(self, *args, test_mode=True, **kwargs):
DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
-"""Regex to filter out names with punctuations, symbols, decimal numbers and emojis."""
+"""Regex to filter out names with punctuation, symbols, numbers and emojis."""
+
+
+class OrcidOrgToAffiliationMapper:
+ """Default ORCiD Org ID to affiliation ID mapper."""
+
+ def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
+ """Constructor."""
+ self._org_ids_mapping = org_ids_mapping
+ self._org_ids_mapping_file = org_ids_mapping_file
+
+ @cached_property
+ def org_ids_mapping(self):
+ """Mapping of ORCiD org IDs to affiliation IDs."""
+ org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
+ "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
+ )
+ if org_ids_mapping_file:
+ org_ids_mapping_file = Path(org_ids_mapping_file)
+ # If the path is relative, prepend the instance path
+ if not org_ids_mapping_file.is_absolute():
+ org_ids_mapping_file = (
+ Path(current_app.instance_path) / org_ids_mapping_file
+ )
+ with open(org_ids_mapping_file) as fin:
+ result = {}
+ reader = csv.reader(fin)
+
+ # Check if the first row is a header
+ org_scheme, org_id, aff_id = next(reader)
+ if org_scheme.lower() != "org_scheme":
+ result[(org_scheme, org_id)] = aff_id
+
+ for org_scheme, org_id, aff_id in reader:
+ result[(org_scheme, org_id)] = aff_id
+
+ return result
+
+ return self._org_ids_mapping or {}
+
+ def __call__(self, org_scheme, org_id):
+ """Map an ORCiD org ID to an affiliation ID."""
+ # By default we know that ROR IDs are linkable
+ if org_scheme == "ROR":
+ return org_id.split("/")[-1]
+ # Otherwise see if we have a mapping from other schemes to an affiliation ID
+ return self.org_ids_mapping.get((org_scheme, org_id))
class OrcidTransformer(BaseTransformer):
"""Transforms an ORCiD record into a names record."""
def __init__(
- self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs
+ self,
+ *args,
+ names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
+ org_id_to_affiliation_id_func=None,
+ **kwargs,
) -> None:
"""Constructor."""
self._names_exclude_regex = names_exclude_regex
+ self._org_id_to_affiliation_id_func = (
+ org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
+ )
super().__init__()
- def _is_valid_name(self, name):
- """Check whether the name passes the regex."""
- if not self._names_exclude_regex:
- return True
- return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+ def org_id_to_affiliation_id(self, org_scheme, org_id):
+        """Convert an ORCiD org ID to a linkable affiliation ID."""
+ return self._org_id_to_affiliation_id_func(org_scheme, org_id)
def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
@@ -166,42 +217,88 @@ def apply(self, stream_entry, **kwargs):
name = person.get("name")
if name is None:
- raise TransformerError(f"Name not found in ORCiD entry.")
+ raise TransformerError("Name not found in ORCiD entry.")
if name.get("family-name") is None:
- raise TransformerError(f"Family name not found in ORCiD entry.")
+ raise TransformerError("Family name not found in ORCiD entry.")
if not self._is_valid_name(name["given-names"] + name["family-name"]):
- raise TransformerError(f"Invalid characters in name.")
+ raise TransformerError("Invalid characters in name.")
entry = {
"id": orcid_id,
"given_name": name.get("given-names"),
"family_name": name.get("family-name"),
"identifiers": [{"scheme": "orcid", "identifier": orcid_id}],
- "affiliations": [],
+ "affiliations": self._extract_affiliations(record),
}
+ stream_entry.entry = entry
+ return stream_entry
+
+ def _is_valid_name(self, name):
+ """Check whether the name passes the regex."""
+ if not self._names_exclude_regex:
+ return True
+ return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+
+ def _extract_affiliations(self, record):
+ """Extract affiliations from the ORCiD record."""
+ result = []
try:
- employments = dict_lookup(
- record, "activities-summary.employments.affiliation-group"
+ employments = (
+ record.get("activities-summary", {})
+ .get("employments", {})
+ .get("affiliation-group", [])
)
+
+            # If there is a single value, the XML-to-dict conversion doesn't wrap it in a list
if isinstance(employments, dict):
employments = [employments]
- history = set()
+
+ # Remove the "employment-summary" nesting
+ employments = [
+ employment.get("employment-summary", {}) for employment in employments
+ ]
+
for employment in employments:
- terminated = employment["employment-summary"].get("end-date")
- affiliation = dict_lookup(
- employment,
- "employment-summary.organization.name",
- )
- if affiliation not in history and not terminated:
- history.add(affiliation)
- entry["affiliations"].append({"name": affiliation})
+ terminated = employment.get("end-date")
+ if terminated:
+ continue
+
+ org = employment["organization"]
+ aff_id = self._extract_affiliation_id(org)
+
+ # Skip adding if the ID already exists in result
+ if aff_id and any(aff.get("id") == aff_id for aff in result):
+ continue
+
+ # Skip adding if the name exists in result with no ID
+ if any(
+ aff.get("name") == org["name"] and "id" not in aff for aff in result
+ ):
+ continue
+
+ aff = {"name": org["name"]}
+ if aff_id:
+ aff["id"] = aff_id
+
+ result.append(aff)
except Exception:
pass
-
- stream_entry.entry = entry
- return stream_entry
+ return result
+
+ def _extract_affiliation_id(self, org):
+ """Extract the affiliation ID from an ORCiD organization."""
+ dis_org = org.get("disambiguated-organization")
+ if not dis_org:
+ return
+
+ aff_id = None
+ org_id = dis_org.get("disambiguated-organization-identifier")
+ org_scheme = dis_org.get("disambiguation-source")
+ if org_id and org_scheme:
+ aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
+ return aff_id
class NamesServiceWriter(ServiceWriter):
diff --git a/tests/contrib/names/test_names_datastreams.py b/tests/contrib/names/test_names_datastreams.py
index 299bb274..c9920947 100644
--- a/tests/contrib/names/test_names_datastreams.py
+++ b/tests/contrib/names/test_names_datastreams.py
@@ -8,6 +8,8 @@
"""Names data streams tests."""
+import os
+import tempfile
from copy import deepcopy
from unittest.mock import patch
@@ -23,6 +25,7 @@
)
from invenio_vocabularies.datastreams import StreamEntry
from invenio_vocabularies.datastreams.errors import TransformerError, WriterError
+from invenio_vocabularies.datastreams.transformers import XMLTransformer
@pytest.fixture(scope="function")
@@ -43,52 +46,115 @@ def name_full_data():
}
-@pytest.fixture(scope="module")
-def expected_from_xml():
- return {
- "id": "0000-0001-8135-3489",
- "given_name": "Lars Holm",
- "family_name": "Nielsen",
- "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}],
- "affiliations": [{"name": "CERN"}],
- }
-
-
-XML_ENTRY_DATA = bytes(
- '\n'
- '\n'
- " \n"
- " https://orcid.org/0000-0001-8135-3489\n" # noqa
- " 0000-0001-8135-3489\n"
- " orcid.org\n"
- " \n"
- ' \n'
- ' \n' # noqa
- " Lars Holm" # noqa
- " Nielsen\n" # noqa
- " \n"
- ' \n' # noqa
- " \n"
- ' \n' # noqa
- ' \n' # noqa
- " \n"
- " \n"
- " \n"
- " CERN\n"
- " \n"
- " \n"
- " \n"
- " \n"
- " \n"
- "\n",
- encoding="raw_unicode_escape",
-)
-
-
-@pytest.fixture(scope="function")
-def bytes_xml_data():
- # simplified version of an XML file of the ORCiD dump
- return XML_ENTRY_DATA
+# NOTE: This is a simplified version of the original XML data from ORCiD. Sections like
+# works, funding, educations, etc. and attributes have been removed.
+XML_ENTRY_DATA = b"""
+
+
+ https://orcid.org/0000-0001-8135-3489
+ 0000-0001-8135-3489
+ orcid.org
+
+
+
+ Lars Holm
+ Nielsen
+
+
+
+
+
+
+
+ 2012
+ 03
+ 16
+
+
+ CERN
+
+ https://ror.org/01ggx4157
+
+ ROR
+
+
+
+
+
+
+
+ 2024
+ 01
+
+
+ ACME Inc.
+
+ grid.123456.z
+ GRID
+
+
+
+
+
+
+
+ 2007
+ 08
+
+
+ 2012
+ 03
+
+
+ European Southern Observatory
+
+ grid.424907.c
+ GRID
+
+
+
+
+
+
+
+"""
+
+XML_ENTRY_DATA_SINGLE_EMPLOYMENT = b"""
+
+
+ https://orcid.org/0000-0001-8135-3489
+ 0000-0001-8135-3489
+ orcid.org
+
+
+
+ Lars Holm
+ Nielsen
+
+
+
+
+
+
+
+ 2012
+ 03
+ 16
+
+
+ CERN
+
+ https://ror.org/01ggx4157
+
+ ROR
+
+
+
+
+
+
+
+"""
NAMES_TEST = {
@@ -101,69 +167,206 @@ def bytes_xml_data():
@pytest.fixture(scope="function")
-def dict_xml_entry():
- return StreamEntry(
+def orcid_data():
+ base = {
+ "orcid-identifier": {
+ "uri": "https://orcid.org/0000-0001-8135-3489",
+ "path": "0000-0001-8135-3489",
+ "host": "orcid.org",
+ },
+ "person": {
+ "name": {"given-names": "Lars Holm", "family-name": "Nielsen"},
+ },
+ }
+
+ employments = [
+ {
+ "employment-summary": {
+ "organization": {
+ "name": "CERN",
+ "disambiguated-organization": {
+ "disambiguated-organization-identifier": "https://ror.org/01ggx4157",
+ "disambiguation-source": "ROR",
+ },
+ },
+ "start-date": {"year": "2012", "month": "03", "day": "16"},
+ }
+ },
{
- "orcid-identifier": {
- "uri": "https://orcid.org/0000-0001-8135-3489",
- "path": "0000-0001-8135-3489",
- "host": "orcid.org",
+ "employment-summary": {
+ "organization": {
+ "name": "ACME Inc.",
+ "disambiguated-organization": {
+ "disambiguated-organization-identifier": "grid.123456.z",
+ "disambiguation-source": "GRID",
+ },
+ },
+ "start-date": {"year": "2024", "month": "01"},
+ }
+ },
+ {
+ "employment-summary": {
+ "organization": {
+ "name": "European Southern Observatory",
+ "disambiguated-organization": {
+ "disambiguated-organization-identifier": "grid.424907.c",
+ "disambiguation-source": "GRID",
+ },
+ },
+ "start-date": {"year": "2007", "month": "08"},
+ "end-date": {"year": "2012", "month": "03"},
},
- "person": {
- "name": {
- "given-names": "Lars Holm",
- "family-name": "Nielsen",
- "@visibility": "public",
- "@path": "0000-0001-8135-3489",
+ },
+ ]
+
+ return {
+ "xml": {
+ "multi_employment": XML_ENTRY_DATA,
+ "single_employment": XML_ENTRY_DATA_SINGLE_EMPLOYMENT,
+ },
+ "json": {
+ "multi_employment": {
+ **base,
+ "activities-summary": {
+ "employments": {"affiliation-group": employments},
},
- "external-identifiers": {
- "@path": "/0000-0001-8135-3489/external-identifiers"
+ },
+ "single_employment": {
+ **base,
+ "activities-summary": {
+ # NOTE: Because the XML data has only one employment, the
+ # transformer will not create a list of employments, but instead a
+ # single employment object.
+ "employments": {"affiliation-group": employments[0]}
},
- "@path": "/0000-0001-8135-3489/person",
},
- "activities-summary": {
- "employments": {
- "affiliation-group": {
- "employment-summary": {"organization": {"name": "CERN"}}
- },
- "@path": "/0000-0001-8135-3489/employments",
+ "duplicate_affiliations": {
+ **base,
+ "activities-summary": {
+ "employments": {"affiliation-group": employments + employments},
},
- "@path": "/0000-0001-8135-3489/activities",
},
- "@path": "/0000-0001-8135-3489",
- }
- )
+ },
+ }
-def test_orcid_transformer(dict_xml_entry, expected_from_xml):
+@pytest.fixture(scope="module")
+def expected_from_xml():
+ base = {
+ "id": "0000-0001-8135-3489",
+ "given_name": "Lars Holm",
+ "family_name": "Nielsen",
+ "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}],
+ }
+
+ return {
+ "multi_employment": {
+ **base,
+ "affiliations": [
+ {"id": "01ggx4157", "name": "CERN"},
+ # NOTE: GRID identifiers do not result in linked affiliations
+ {"name": "ACME Inc."},
+ # NOTE: terminated employments are not included in the affiliations
+ ],
+ },
+ "single_employment": {
+ **base,
+ "affiliations": [{"id": "01ggx4157", "name": "CERN"}],
+ },
+ "duplicate_affiliations": {
+ **base,
+ "affiliations": [
+ # Affiliations are deduplicated
+ {"id": "01ggx4157", "name": "CERN"},
+ {"name": "ACME Inc."},
+ ],
+ },
+ }
+
+
+@pytest.fixture(scope="function")
+def org_ids_mapping_file_config(app):
+ """ORCiD organization IDs mappings CSV file."""
+ fout = tempfile.NamedTemporaryFile(mode="w", delete=False)
+ fout.write('"GRID","grid.123456.z","acme_inc_id"\n')
+ fout.close()
+
+ old_config = app.config.get("VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH")
+ app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = fout.name
+ yield
+
+ app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = old_config
+ os.unlink(fout.name)
+
+
+def test_orcid_xml_transform(orcid_data):
+ """Test XML transformer with ORCiD record.
+
+ NOTE: this might look somewhat "redundant", since we're again testing if the
+ XMLTransformer works as expected. However, this is a more specific test
+    that demonstrates how the transformer behaves with more complex XML data that
+ can have e.g. nested elements that can be arrays or objects.
+ """
+ xml_data = orcid_data["xml"]
+ json_data = orcid_data["json"]
+
+ for key, data in xml_data.items():
+ transformer = XMLTransformer(root_element="record")
+ transform_result = transformer.apply(StreamEntry(data)).entry
+ assert transform_result == json_data[key]
+
+
+def test_orcid_transformer(app, orcid_data, expected_from_xml):
+ """Test ORCiD transformer data."""
transformer = OrcidTransformer()
- assert expected_from_xml == transformer.apply(dict_xml_entry).entry
+ input_data = orcid_data["json"]
+
+ for key, data in input_data.items():
+ assert expected_from_xml[key] == transformer.apply(StreamEntry(data)).entry, key
+
+
+def test_orcid_transformer_org_ids_mapping_from_file(
+ app,
+ orcid_data,
+ expected_from_xml,
+ org_ids_mapping_file_config,
+):
+ """Test ORCiD transformer data with org IDs mapping file."""
+ transformer = OrcidTransformer()
+ input_data = orcid_data["json"]["multi_employment"]
+ expected = deepcopy(expected_from_xml["multi_employment"])
+ expected["affiliations"] = [
+ {"id": "01ggx4157", "name": "CERN"},
+ {"id": "acme_inc_id", "name": "ACME Inc."},
+ ]
+
+ assert expected == transformer.apply(StreamEntry(input_data)).entry
@pytest.mark.parametrize("name,is_valid_name", NAMES_TEST.items())
-def test_orcid_transformer_different_names(dict_xml_entry, name, is_valid_name):
+def test_orcid_transformer_name_filtering(orcid_data, name, is_valid_name):
transformer = OrcidTransformer()
- val = deepcopy(dict_xml_entry)
+ val = deepcopy(orcid_data["json"]["multi_employment"])
if is_valid_name:
- assert transformer.apply(val).entry
+ assert transformer.apply(StreamEntry(val)).entry
else:
with pytest.raises(TransformerError):
- val.entry["person"]["name"]["given-names"] = name
- val.entry["person"]["name"]["family-name"] = ""
- transformer.apply(val)
+ val["person"]["name"]["given-names"] = name
+ val["person"]["name"]["family-name"] = ""
+ transformer.apply(StreamEntry(val))
with pytest.raises(TransformerError):
- val.entry["person"]["name"]["given-names"] = ""
- val.entry["person"]["name"]["family-name"] = name
- transformer.apply(val)
+ val["person"]["name"]["given-names"] = ""
+ val["person"]["name"]["family-name"] = name
+ transformer.apply(StreamEntry(val))
@pytest.mark.parametrize("name", NAMES_TEST.keys())
-def test_orcid_transformer_different_names_no_regex(dict_xml_entry, name):
+def test_orcid_transformer_different_names_no_regex(orcid_data, name):
transformer = OrcidTransformer(names_exclude_regex=None)
- val = deepcopy(dict_xml_entry)
- val.entry["person"]["name"]["given-names"] = name
- val.entry["person"]["name"]["family-name"] = ""
- assert transformer.apply(val).entry
+ val = deepcopy(orcid_data["json"]["multi_employment"])
+ val["person"]["name"]["given-names"] = name
+ val["person"]["name"]["family-name"] = ""
+ assert transformer.apply(StreamEntry(val)).entry
class MockResponse:
@@ -172,14 +375,14 @@ class MockResponse:
@patch("requests.get", side_effect=lambda url, headers: MockResponse())
-def test_orcid_http_reader(_, bytes_xml_data):
+def test_orcid_http_reader(_):
reader = OrcidHTTPReader(id="0000-0001-8135-3489")
results = []
for entry in reader.read():
results.append(entry)
assert len(results) == 1
- assert bytes_xml_data == results[0]
+ assert XML_ENTRY_DATA == results[0]
def test_names_service_writer_create(app, db, search_clear, name_full_data):
diff --git a/tests/datastreams/test_transformers.py b/tests/datastreams/test_transformers.py
index 8c5b1943..dd7628d7 100644
--- a/tests/datastreams/test_transformers.py
+++ b/tests/datastreams/test_transformers.py
@@ -18,26 +18,51 @@
@pytest.fixture(scope="module")
def expected_from_xml():
return {
- "record": {
- "field_one": "value",
- "multi_field": {"some": "value", "another": "value too"},
- }
+ "top_level_field": "top-level single value",
+ "top_level_object_field": {
+ "some": "value",
+ "another": "value too",
+ "nested_array_field": {
+ "@array_attr": "value",
+ "array_element_object": [
+ {
+ "@obj_attr": "first",
+ "element_foo": "value1",
+ "element_bar": "value1",
+ },
+ {
+ "@obj_attr": "second",
+ "element_foo": "value2",
+ "element_bar": "value2",
+ "element_qux": "value2",
+ },
+ ],
+ },
+ },
}
def test_xml_transformer(expected_from_xml):
bytes_xml_entry = StreamEntry(
- bytes(
- '\n'
- "\n"
- " value\n"
- " \n"
- " value\n"
- " value too\n"
- " \n"
- "\n",
- encoding="raw_unicode_escape",
- )
+ b"""
+
+ top-level single value
+
+ value
+ value too
+
+
+ value1
+ value1
+
+
+ value2
+ value2
+ value2
+
+
+
+ """
)
transformer = XMLTransformer()
@@ -46,15 +71,25 @@ def test_xml_transformer(expected_from_xml):
def test_bad_xml_transformer():
bytes_xml_entry = StreamEntry(
- bytes(
- '\n'
- "value\n"
- "\n"
- " value\n"
- " value too\n"
- "\n",
- encoding="raw_unicode_escape",
- )
+ b"""
+
+ top-level single value
+
+ value
+ value too
+
+
+ value1
+ value1
+
+
+ value2
+ value2
+ value2
+
+
+
+ """
)
transformer = XMLTransformer(root_element="field_two")