Skip to content

Commit

Permalink
names: support mapping affiliation IDs
Browse files Browse the repository at this point in the history
* Allows providing a CSV file to the ORCiD transformer for mapping
  organization identifiers encountered in the ORCiD data to linkable
  affiliation IDs.
  • Loading branch information
slint committed Dec 4, 2024
1 parent 147c750 commit 18d00e2
Show file tree
Hide file tree
Showing 3 changed files with 354 additions and 112 deletions.
75 changes: 59 additions & 16 deletions invenio_vocabularies/contrib/names/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import regex as re
from flask import current_app
from invenio_access.permissions import system_identity
from werkzeug.utils import cached_property

from invenio_vocabularies.contrib.names.s3client import S3OrcidClient

Expand Down Expand Up @@ -150,12 +151,16 @@ def __init__(
names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
affiliation_relation_schemes=None,
org_scheme_mappping=None,
org_ids_mapping=None,
org_ids_mapping_file=None,
**kwargs,
) -> None:
"""Constructor."""
self._names_exclude_regex = names_exclude_regex
self._affiliation_relation_schemes = affiliation_relation_schemes
self._org_scheme_mappping = org_scheme_mappping
self._org_ids_mapping = org_ids_mapping
self._org_ids_mapping_file = org_ids_mapping_file
super().__init__()

@property
Expand All @@ -173,6 +178,21 @@ def org_scheme_mappping(self):
"FUNDREF": "fundref",
}

@cached_property
def org_ids_mapping(self):
    """Mapping of ORCiD org IDs to affiliation IDs.

    The mapping is read from a three-column CSV file (organization scheme,
    organization identifier, affiliation ID). The file path comes from the
    ``org_ids_mapping_file`` constructor argument or, if that is not set,
    from the ``VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH`` application config.
    When no file is configured, the ``org_ids_mapping`` constructor argument
    is returned instead (or an empty dict if that is not set either).

    :returns: dict keyed by ``(org_scheme, org_id)`` tuples.
    :raises ValueError: if a non-empty row does not have exactly 3 columns.
    """
    org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
        "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
    )
    if org_ids_mapping_file:
        # The csv module requires files to be opened with newline="" so that
        # line endings (and newlines embedded in quoted fields) are parsed
        # correctly; an explicit encoding keeps loading platform-independent.
        with open(org_ids_mapping_file, newline="", encoding="utf-8") as fin:
            # csv.reader yields an empty list for blank lines (e.g. a
            # trailing one); skip those instead of failing on the unpacking.
            rows = (row for row in csv.reader(fin) if row)
            return {
                (org_scheme, org_id): ror_id for org_scheme, org_id, ror_id in rows
            }

    return self._org_ids_mapping or {}

def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
record = stream_entry.entry
Expand Down Expand Up @@ -212,36 +232,59 @@ def _extract_affiliations(self, record):
employments = (
record.get("activities-summary", {})
.get("employments", {})
.get("affiliation-group")
.get("affiliation-group", [])
)

# If there are single values, the XML to dict, doesn't wrap them in a list
if isinstance(employments, dict):
employments = [employments]

# Remove the "employment-summary" nesting
employments = [
employment.get("employment-summary", {}) for employment in employments
]

history = set()
for employment in employments:
terminated = employment["employment-summary"].get("end-date")
org = employment["employment-summary"]["organization"]
if org["name"] not in history and not terminated:
history.add(org["name"])
aff = {"name": org["name"]}
terminated = employment.get("end-date")
org = employment["organization"]

if org.get("disambiguated-organization"):
dis_org = org["disambiguated-organization"]
org_id = dis_org.get("disambiguated-organization-identifier")
org_scheme = dis_org.get("disambiguation-source")
aff_scheme = self.org_scheme_mappping.get(org_scheme)
if terminated or org["name"] in history:
continue

if org_id and aff_scheme in self.affiliation_relation_schemes:
if aff_scheme == "ror":
org_id = org_id.split("/")[-1]
history.add(org["name"])
aff = {"name": org["name"]}

aff["id"] = org_id
# Extract the org ID, to link to the affiliation vocabulary
aff_id = self._extract_affiliation_id(org)
if aff_id:
aff["id"] = aff_id

result.append(aff)
result.append(aff)
except Exception:
pass
return result

def _extract_affiliation_id(self, org):
"""Extract the affiliation ID from an ORCiD organization."""
dis_org = org.get("disambiguated-organization")
if not dis_org:
return

aff_id = None

org_id = dis_org.get("disambiguated-organization-identifier")
org_scheme = dis_org.get("disambiguation-source")
aff_scheme = self.org_scheme_mappping.get(org_scheme)
if org_id and aff_scheme in self.affiliation_relation_schemes:
if aff_scheme == "ror":
org_id = org_id.split("/")[-1]
aff_id = org_id
else:
aff_id = self.org_ids_mapping.get((org_scheme, org_id))

return aff_id


class NamesServiceWriter(ServiceWriter):
"""Names service writer."""
Expand Down
Loading

0 comments on commit 18d00e2

Please sign in to comment.