Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

names: link affiliations IDs from ORCiD parsing #441

Merged
merged 5 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions invenio_vocabularies/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,16 @@ def is_edmo(val):
"days": 1,
}
"""ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""

VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.

The path can be specified as either an absolute path or a relative path within the
Flask app instance folder (i.e. ``current_app.instance_path``).

The CSV file should have the following columns:

- `org_scheme`: The scheme (disambiguation source) of the ORCiD organization ID.
- `org_id`: The ORCiD organization ID.
- `aff_id`: The affiliation ID to map to.
"""
155 changes: 126 additions & 29 deletions invenio_vocabularies/contrib/names/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
import tarfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import timedelta
from pathlib import Path

import arrow
import regex as re
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_records.dictutils import dict_lookup
from werkzeug.utils import cached_property

from invenio_vocabularies.contrib.names.s3client import S3OrcidClient

Expand Down Expand Up @@ -48,9 +49,8 @@ def _fetch_orcid_data(self, orcid_to_sync, bucket):
key = f"{suffix}/{orcid_to_sync}.xml"
try:
return self.s3_client.read_file(f"s3://{bucket}/{key}")
except Exception as e:
# TODO: log
return None
except Exception:
current_app.logger.exception("Failed to fetch ORCiD record.")

def _process_lambda_file(self, fileobj):
"""Process the ORCiD lambda file and returns a list of ORCiDs to sync.
Expand Down Expand Up @@ -139,24 +139,75 @@ def __init__(self, *args, test_mode=True, **kwargs):


DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
"""Regex to filter out names with punctuations, symbols, decimal numbers and emojis."""
"""Regex to filter out names with punctuation, symbols, numbers and emojis."""


class OrcidOrgToAffiliationMapper:
    """Default ORCiD Org ID to affiliation ID mapper.

    ROR identifiers are mapped directly to the last path segment of the ID.
    Any other scheme is looked up in a ``(org_scheme, org_id) -> aff_id``
    mapping, loaded either from a CSV file or from the dictionary passed to
    the constructor.
    """

    def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
        """Constructor.

        :param org_ids_mapping: Optional dictionary mapping
            ``(org_scheme, org_id)`` tuples to affiliation IDs. It is only used
            when no mapping file is set (neither here nor in the app config).
        :param org_ids_mapping_file: Optional path to a CSV mapping file. When
            not given, the ``VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH`` config
            value is used instead.
        """
        self._org_ids_mapping = org_ids_mapping
        self._org_ids_mapping_file = org_ids_mapping_file

    @cached_property
    def org_ids_mapping(self):
        """Mapping of ORCiD org IDs to affiliation IDs.

        The CSV file (if any) is read once and the result is cached on the
        instance. Relative paths are resolved against the Flask app instance
        folder. An optional header row (first column ``org_scheme``) and blank
        lines are skipped; an empty file yields an empty mapping.

        :returns: A dictionary keyed by ``(org_scheme, org_id)`` tuples.
        :raises ValueError: If a non-blank row doesn't have exactly 3 columns.
        """
        org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
            "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
        )
        if not org_ids_mapping_file:
            return self._org_ids_mapping or {}

        org_ids_mapping_file = Path(org_ids_mapping_file)
        # If the path is relative, prepend the instance path
        if not org_ids_mapping_file.is_absolute():
            org_ids_mapping_file = Path(current_app.instance_path) / org_ids_mapping_file

        result = {}
        # ``newline=""`` is what the csv module expects; ``utf-8-sig`` strips a
        # leading BOM so that the header row is still recognized.
        with open(org_ids_mapping_file, newline="", encoding="utf-8-sig") as fin:
            reader = csv.reader(fin)
            first_row = True
            for row in reader:
                if not row:
                    # Blank line
                    continue
                if len(row) != 3:
                    raise ValueError(
                        f"Invalid row at line {reader.line_num} of "
                        f"{org_ids_mapping_file}: expected 3 columns "
                        f"(org_scheme, org_id, aff_id), got {len(row)}."
                    )
                org_scheme, org_id, aff_id = row
                if first_row:
                    first_row = False
                    # Check if the first row is a header
                    if org_scheme.lower() == "org_scheme":
                        continue
                result[(org_scheme, org_id)] = aff_id

        return result

    def __call__(self, org_scheme, org_id):
        """Map an ORCiD org ID to an affiliation ID.

        :param org_scheme: Scheme of the organization ID (e.g. ``ROR``).
        :param org_id: The organization ID in the given scheme.
        :returns: The affiliation ID, or ``None`` if no mapping is known.
        """
        # By default we know that ROR IDs are linkable
        if org_scheme == "ROR":
            # Tolerate a trailing slash so we never return an empty ID
            return org_id.rstrip("/").split("/")[-1]
        # Otherwise see if we have a mapping from other schemes to an affiliation ID
        return self.org_ids_mapping.get((org_scheme, org_id))


class OrcidTransformer(BaseTransformer):
"""Transforms an ORCiD record into a names record."""

def __init__(
self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs
self,
*args,
names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
org_id_to_affiliation_id_func=None,
**kwargs,
) -> None:
"""Constructor."""
self._names_exclude_regex = names_exclude_regex
self._org_id_to_affiliation_id_func = (
org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
)
super().__init__()

def _is_valid_name(self, name):
"""Check whether the name passes the regex."""
if not self._names_exclude_regex:
return True
return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
def org_id_to_affiliation_id(self, org_scheme, org_id):
"""Convert an ORCiD org ID to a linkable affiliation ID."""
return self._org_id_to_affiliation_id_func(org_scheme, org_id)

def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
Expand All @@ -166,42 +217,88 @@ def apply(self, stream_entry, **kwargs):

name = person.get("name")
if name is None:
raise TransformerError(f"Name not found in ORCiD entry.")
raise TransformerError("Name not found in ORCiD entry.")
if name.get("family-name") is None:
raise TransformerError(f"Family name not found in ORCiD entry.")
raise TransformerError("Family name not found in ORCiD entry.")

if not self._is_valid_name(name["given-names"] + name["family-name"]):
raise TransformerError(f"Invalid characters in name.")
raise TransformerError("Invalid characters in name.")

entry = {
"id": orcid_id,
"given_name": name.get("given-names"),
"family_name": name.get("family-name"),
"identifiers": [{"scheme": "orcid", "identifier": orcid_id}],
"affiliations": [],
"affiliations": self._extract_affiliations(record),
}

stream_entry.entry = entry
return stream_entry

def _is_valid_name(self, name):
"""Check whether the name passes the regex."""
if not self._names_exclude_regex:
return True
return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))

def _extract_affiliations(self, record):
"""Extract affiliations from the ORCiD record."""
result = []
try:
employments = dict_lookup(
record, "activities-summary.employments.affiliation-group"
employments = (
record.get("activities-summary", {})
.get("employments", {})
.get("affiliation-group", [])
)

# If there is only a single value, the XML-to-dict conversion doesn't wrap it in a list
if isinstance(employments, dict):
employments = [employments]
history = set()

# Remove the "employment-summary" nesting
employments = [
employment.get("employment-summary", {}) for employment in employments
]

for employment in employments:
terminated = employment["employment-summary"].get("end-date")
affiliation = dict_lookup(
employment,
"employment-summary.organization.name",
)
if affiliation not in history and not terminated:
history.add(affiliation)
entry["affiliations"].append({"name": affiliation})
terminated = employment.get("end-date")
if terminated:
continue

org = employment["organization"]
aff_id = self._extract_affiliation_id(org)

# Skip adding if the ID already exists in result
if aff_id and any(aff.get("id") == aff_id for aff in result):
continue

# Skip adding if the name exists in result with no ID
if any(
aff.get("name") == org["name"] and "id" not in aff for aff in result
):
continue

aff = {"name": org["name"]}
if aff_id:
aff["id"] = aff_id

result.append(aff)
except Exception:
pass

stream_entry.entry = entry
return stream_entry
return result

def _extract_affiliation_id(self, org):
"""Extract the affiliation ID from an ORCiD organization."""
dis_org = org.get("disambiguated-organization")
if not dis_org:
return

aff_id = None
org_id = dis_org.get("disambiguated-organization-identifier")
org_scheme = dis_org.get("disambiguation-source")
if org_id and org_scheme:
aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
return aff_id


class NamesServiceWriter(ServiceWriter):
Expand Down
Loading