inveniosoftware · jrcastro2 · Dec 6, 2024 · Nov 25, 2024 · Nov 25, 2024 · Dec 5, 2024
diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
@@ -213,3 +213,16 @@ def is_edmo(val):
     "days": 1,
 }
 """ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
+
+VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
+"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.
+
+The path can be specified as either an absolute path or a relative path within the
+Flask app instance folder (i.e. ``current_app.instance_path``).
+
+The CSV file should have the following columns:
+
+- `org_scheme`: The ORCiD organization ID scheme.
+- `org_id`: The ORCiD organization ID.
+- `aff_id`: The affiliation ID to map to.
+"""
diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py
@@ -13,12 +13,13 @@
 import tarfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta
+from pathlib import Path
 
 import arrow
 import regex as re
 from flask import current_app
 from invenio_access.permissions import system_identity
-from invenio_records.dictutils import dict_lookup
+from werkzeug.utils import cached_property
 
 from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
 
@@ -48,9 +49,8 @@ def _fetch_orcid_data(self, orcid_to_sync, bucket):
         key = f"{suffix}/{orcid_to_sync}.xml"
         try:
             return self.s3_client.read_file(f"s3://{bucket}/{key}")
-        except Exception as e:
-            # TODO: log
-            return None
+        except Exception:
+            current_app.logger.exception("Failed to fetch ORCiD record.")
 
     def _process_lambda_file(self, fileobj):
         """Process the ORCiD lambda file and returns a list of ORCiDs to sync.
@@ -139,24 +139,75 @@ def __init__(self, *args, test_mode=True, **kwargs):
 
 
 DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
-"""Regex to filter out names with punctuations, symbols, decimal numbers and emojis."""
+"""Regex to filter out names with punctuation, symbols, numbers and emojis."""
+
+
+class OrcidOrgToAffiliationMapper:
+    """Default ORCiD Org ID to affiliation ID mapper."""
+
+    def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
+        """Constructor."""
+        self._org_ids_mapping = org_ids_mapping
+        self._org_ids_mapping_file = org_ids_mapping_file
+
+    @cached_property
+    def org_ids_mapping(self):
+        """Mapping of ORCiD org IDs to affiliation IDs."""
+        org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
+            "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
+        )
+        if org_ids_mapping_file:
+            org_ids_mapping_file = Path(org_ids_mapping_file)
+            # If the path is relative, prepend the instance path
+            if not org_ids_mapping_file.is_absolute():
+                org_ids_mapping_file = (
+                    Path(current_app.instance_path) / org_ids_mapping_file
+                )
+            with open(org_ids_mapping_file) as fin:
+                result = {}
+                reader = csv.reader(fin)
+
+                # Check if the first row is a header
+                org_scheme, org_id, aff_id = next(reader)
+                if org_scheme.lower() != "org_scheme":
+                    result[(org_scheme, org_id)] = aff_id
+
+                for org_scheme, org_id, aff_id in reader:
+                    result[(org_scheme, org_id)] = aff_id
+
+                return result
+
+        return self._org_ids_mapping or {}
+
+    def __call__(self, org_scheme, org_id):
+        """Map an ORCiD org ID to an affiliation ID."""
+        # By default we know that ROR IDs are linkable
+        if org_scheme == "ROR":
+            return org_id.split("/")[-1]
+        # Otherwise see if we have a mapping from other schemes to an affiliation ID
+        return self.org_ids_mapping.get((org_scheme, org_id))
 
 
 class OrcidTransformer(BaseTransformer):
     """Transforms an ORCiD record into a names record."""
 
     def __init__(
-        self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs
+        self,
+        *args,
+        names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
+        org_id_to_affiliation_id_func=None,
+        **kwargs,
     ) -> None:
         """Constructor."""
         self._names_exclude_regex = names_exclude_regex
+        self._org_id_to_affiliation_id_func = (
+            org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
+        )
         super().__init__()
 
-    def _is_valid_name(self, name):
-        """Check whether the name passes the regex."""
-        if not self._names_exclude_regex:
-            return True
-        return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+    def org_id_to_affiliation_id(self, org_scheme, org_id):
+        """Convert an ORCiD org ID to a linkable affiliation ID."""
+        return self._org_id_to_affiliation_id_func(org_scheme, org_id)
 
     def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
@@ -166,42 +217,88 @@ def apply(self, stream_entry, **kwargs):
 
         name = person.get("name")
         if name is None:
-            raise TransformerError(f"Name not found in ORCiD entry.")
+            raise TransformerError("Name not found in ORCiD entry.")
         if name.get("family-name") is None:
-            raise TransformerError(f"Family name not found in ORCiD entry.")
+            raise TransformerError("Family name not found in ORCiD entry.")
 
         if not self._is_valid_name(name["given-names"] + name["family-name"]):
-            raise TransformerError(f"Invalid characters in name.")
+            raise TransformerError("Invalid characters in name.")
 
         entry = {
             "id": orcid_id,
             "given_name": name.get("given-names"),
             "family_name": name.get("family-name"),
             "identifiers": [{"scheme": "orcid", "identifier": orcid_id}],
-            "affiliations": [],
+            "affiliations": self._extract_affiliations(record),
         }
 
+        stream_entry.entry = entry
+        return stream_entry
+
+    def _is_valid_name(self, name):
+        """Check whether the name passes the regex."""
+        if not self._names_exclude_regex:
+            return True
+        return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
+
+    def _extract_affiliations(self, record):
+        """Extract affiliations from the ORCiD record."""
+        result = []
         try:
-            employments = dict_lookup(
-                record, "activities-summary.employments.affiliation-group"
+            employments = (
+                record.get("activities-summary", {})
+                .get("employments", {})
+                .get("affiliation-group", [])
             )
+
+            # A single value isn't wrapped in a list by the XML-to-dict conversion
             if isinstance(employments, dict):
                 employments = [employments]
-            history = set()
+
+            # Remove the "employment-summary" nesting
+            employments = [
+                employment.get("employment-summary", {}) for employment in employments
+            ]
+
             for employment in employments:
-                terminated = employment["employment-summary"].get("end-date")
-                affiliation = dict_lookup(
-                    employment,
-                    "employment-summary.organization.name",
-                )
-                if affiliation not in history and not terminated:
-                    history.add(affiliation)
-                    entry["affiliations"].append({"name": affiliation})
+                terminated = employment.get("end-date")
+                if terminated:
+                    continue
+
+                org = employment["organization"]
+                aff_id = self._extract_affiliation_id(org)
+
+                # Skip adding if the ID already exists in result
+                if aff_id and any(aff.get("id") == aff_id for aff in result):
+                    continue
+
+                # Skip adding if the name exists in result with no ID
+                if any(
+                    aff.get("name") == org["name"] and "id" not in aff for aff in result
+                ):
+                    continue
+
+                aff = {"name": org["name"]}
+                if aff_id:
+                    aff["id"] = aff_id
+
+                result.append(aff)
         except Exception:
             pass
-
-        stream_entry.entry = entry
-        return stream_entry
+        return result
+
+    def _extract_affiliation_id(self, org):
+        """Extract the affiliation ID from an ORCiD organization."""
+        dis_org = org.get("disambiguated-organization")
+        if not dis_org:
+            return
+
+        aff_id = None
+        org_id = dis_org.get("disambiguated-organization-identifier")
+        org_scheme = dis_org.get("disambiguation-source")
+        if org_id and org_scheme:
+            aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
+        return aff_id
 
 
 class NamesServiceWriter(ServiceWriter):