Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

subjects: added datastream for bodc #445

Merged
merged 3 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions invenio_vocabularies/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ def is_edmo(val):
)
"""Subject GEMET file download link."""

# URL of the BODC Parameter Usage Vocabulary export (collection P01 on
# vocab.nerc.ac.uk); the query string requests the RDF/XML media type.
VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL = "http://vocab.nerc.ac.uk/collection/P01/current/?_profile=nvs&_mediatype=application/rdf+xml"
"""Subject BODC-PUV file download link."""

VOCABULARIES_AFFILIATIONS_EDMO_COUNTRY_MAPPING = {
"Cape Verde": "Cabo Verde",
}
Expand Down
9 changes: 9 additions & 0 deletions invenio_vocabularies/contrib/subjects/bodc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""BODC Subjects module."""
111 changes: 111 additions & 0 deletions invenio_vocabularies/contrib/subjects/bodc/datastreams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""BODC subjects datastreams, readers, transformers, and writers."""

from invenio_vocabularies.datastreams.errors import TransformerError
from invenio_vocabularies.datastreams.readers import RDFReader
from invenio_vocabularies.datastreams.transformers import RDFTransformer

from ..config import bodc_puv_file_url

# Available with the "rdf" extra
try:
import rdflib
except ImportError:
rdflib = None


class BODCPUVSubjectsTransformer(RDFTransformer):
    """
    Transformer class to convert BODC-PUV RDF data to a dictionary format.

    Input:
        - Relevant fields:
            - `skos:notation`: Primary identifier for the concept.
            - `skos:prefLabel`: Preferred labels with language codes.
            - `skos:altLabel`: Alternative labels (optional).
            - `skos:definition`: Definitions of the concept (optional).
            - `owl:deprecated`: Boolean flag indicating if the concept is deprecated.

    Output:
        - A dictionary with the following structure:
            {
                "id": "SDN:P01::SAGEMSFM",  # BODC-specific parameter ID (skos:notation).
                "scheme": "BODC-PUV",  # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
                "subject": "AMSSedAge",  # The first alternative label (skos:altLabel), or "" if there is none.
                "title": {
                    "en": "14C age of Foraminiferida"  # English preferred label (skos:prefLabel).
                },
                "props": {
                    "definition": "Accelerated mass spectrometry on picked tests",  # Definition of subject (skos:definition); "props" is {} when absent.
                },
                "identifiers": [
                    {
                        "scheme": "url",  # Type of identifier (URL).
                        "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM"  # URI of the concept.
                    }
                ]
            }

    Concepts whose `owl:deprecated` value reads "true" are skipped: the
    transformation returns ``None`` for them.
    """

    def _get_subject_data(self, rdf_graph, subject):
        """Fetch all triples for a subject and organize them into a dictionary.

        :param rdf_graph: graph queried through ``predicate_objects``.
        :param subject: node whose outgoing triples are collected.
        :returns: dict mapping each predicate (as a string) to the list of
            objects found for it, in the order the graph yields them.
        """
        data = {}
        for predicate, obj in rdf_graph.predicate_objects(subject=subject):
            data.setdefault(str(predicate), []).append(obj)
        return data

    def _transform_entry(self, subject, rdf_graph):
        """Transform an entry to the required dictionary format.

        :param subject: node of the concept being transformed.
        :param rdf_graph: graph holding the triples of ``subject``.
        :returns: the subject dictionary described in the class docstring, or
            ``None`` when the concept is flagged as deprecated.
        :raises TransformerError: if the concept has no ``skos:notation``.
        """
        labels = self._get_labels(subject, rdf_graph)
        subject_data = self._get_subject_data(rdf_graph, subject)

        deprecated = subject_data.get(str(rdflib.namespace.OWL.deprecated), [])
        if deprecated and str(deprecated[0]).lower() == "true":
            return None  # Skip deprecated subjects

        notation = subject_data.get(str(self.skos_core.notation), [])
        if not notation:
            raise TransformerError(f"No id found for: {subject}")
        subject_id = str(notation[0])

        alt_labels = subject_data.get(str(self.skos_core.altLabel), [])
        subject_text = str(alt_labels[0]) if alt_labels else ""

        # Only stringify a definition that actually exists: str(None) would
        # yield the truthy text "None" and leak into "props".
        definitions = subject_data.get(str(self.skos_core.definition), [])
        definition = str(definitions[0]) if definitions else ""

        return {
            "id": subject_id,
            "scheme": "BODC-PUV",
            "subject": subject_text,
            "title": labels,
            "props": {"definition": definition} if definition else {},
            "identifiers": self._get_identifiers(subject),
        }


# Datastream wiring for the BODC-PUV subjects vocabulary.

# Registry entry: the transformer name referenced by DATASTREAM_CONFIG below,
# mapped to the class that implements it.
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "bodc-transformer": BODCPUVSubjectsTransformer,
}

# Pipeline description: an "http" reader pointed at the configured file URL,
# followed by an "rdf" reader, the BODC transformer, and an "async" writer
# configured with the "subjects-service" writer.
DATASTREAM_CONFIG = {
    "readers": [
        {"type": "http", "args": {"origin": bodc_puv_file_url}},
        {"type": "rdf"},
    ],
    "transformers": [
        {"type": "bodc-transformer"},
    ],
    "writers": [
        {
            "args": {"writer": {"type": "subjects-service"}},
            "type": "async",
        },
    ],
}
4 changes: 4 additions & 0 deletions invenio_vocabularies/contrib/subjects/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
)

# Proxy for the BODC-PUV download URL: the value is looked up in
# `current_app.config` when the proxy is used rather than at import time.
bodc_puv_file_url = LocalProxy(
lambda: current_app.config["VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL"]
)


class SubjectsSearchOptions(SearchOptions):
"""Search options."""
Expand Down
6 changes: 2 additions & 4 deletions invenio_vocabularies/contrib/subjects/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .bodc import datastreams as bodc_datastreams
from .euroscivoc import datastreams as euroscivoc_datastreams
from .gemet import datastreams as gemet_datastreams
from .mesh import datastreams as mesh_datastreams
Expand All @@ -32,23 +33,20 @@ def _entry_id(self, entry):

VOCABULARIES_DATASTREAM_READERS = {
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
}
"""Subjects Data Streams readers."""

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**bodc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
}
"""Subjects Data Streams transformers."""

VOCABULARIES_DATASTREAM_WRITERS = {
"subjects-service": SubjectsServiceWriter,
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
}
"""Subjects Data Streams writers."""

Expand Down
44 changes: 33 additions & 11 deletions invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,36 @@


class EuroSciVocSubjectsTransformer(RDFTransformer):
"""Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
"""
0einstein0 marked this conversation as resolved.
Show resolved Hide resolved
Transformer class to convert EuroSciVoc RDF data to a dictionary format.

Input:
- Relevant fields:
- `skos:notation`: Primary identifier for the concept.
- `skos:prefLabel`: Preferred labels with language codes.
- `skos:altLabel`: Alternative labels.
- `skos:broader`: Broader concepts that this concept belongs to.

Output:
{
"id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
"scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
"subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
"title": {
"it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
"en": "Satellite radio", # English preferred label (skos:prefLabel).
},
"props": {
"parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
},
"identifiers": [
{
"scheme": "url", # Type of identifier (URL).
"identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
}
],
}
"""
0einstein0 marked this conversation as resolved.
Show resolved Hide resolved

def _get_notation(self, subject, rdf_graph):
"""Extract the numeric notation for a subject."""
Expand All @@ -38,21 +67,18 @@ def _transform_entry(self, subject, rdf_graph):
for n in reversed(self._find_parents(subject, rdf_graph))
if n
)
identifiers = [{"scheme": "url", "identifier": str(subject)}]

return {
"id": id,
"scheme": "EuroSciVoc",
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": {"parents": parents} if parents else {},
"identifiers": identifiers,
"identifiers": self._get_identifiers(subject),
}


# Configuration for datastream transformers, and writers
VOCABULARIES_DATASTREAM_READERS = {}
VOCABULARIES_DATASTREAM_WRITERS = {}
# Configuration for datastream

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
"euroscivoc-transformer": EuroSciVocSubjectsTransformer
Expand All @@ -71,9 +97,5 @@ def _transform_entry(self, subject, rdf_graph):
},
],
"transformers": [{"type": "euroscivoc-transformer"}],
"writers": [
{
"type": "subjects-service",
}
],
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
}
41 changes: 36 additions & 5 deletions invenio_vocabularies/contrib/subjects/gemet/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,40 @@


class GEMETSubjectsTransformer(RDFTransformer):
"""Transformer class to convert GEMET RDF data to a dictionary format."""
"""
Transformer class to convert GEMET RDF data to a dictionary format.

Input:
- Relevant fields:
- `skos:prefLabel`: Preferred labels with language codes.
- `skos:broader`: References to broader concepts (parent concepts).
- `skos:memberOf`: References to groups or themes the concept belongs to.

Output:
- A dictionary with the following structure:
{
"id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
"scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
"subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
"title": {
"en": "Consumer product", # English label for the concept (skos:prefLabel).
"ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
},
"props": {
"parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
"groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
"themes": [
"http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
]
},
"identifiers": [
{
"scheme": "url", # Type of identifier (URL).
"identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
}
]
}
"""

def _get_parent_notation(self, broader, rdf_graph):
"""Extract parent notation from GEMET URI."""
Expand Down Expand Up @@ -83,13 +116,11 @@ def _transform_entry(self, subject, rdf_graph):
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": props,
"identifiers": identifiers,
"identifiers": self._get_identifiers(subject),
}


# Configuration for datastream transformers, and writers
VOCABULARIES_DATASTREAM_READERS = {}
VOCABULARIES_DATASTREAM_WRITERS = {}
# Configuration for datastream

VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}

Expand Down
12 changes: 12 additions & 0 deletions invenio_vocabularies/datastreams/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""Transformers module."""

from abc import ABC, abstractmethod
from urllib.parse import urlparse

from lxml import etree

Expand Down Expand Up @@ -76,6 +77,17 @@ def skos_core(self):
"""Get the SKOS core namespace."""
return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

def _validate_subject_url(self, subject):
    """Check if the subject is a valid URL.

    :param subject: value whose string form is checked.
    :returns: ``True`` when the string form has both a scheme and a network
        location; ``False`` otherwise, including when it cannot be parsed.
    """
    try:
        parsed = urlparse(str(subject))
    except ValueError:
        # urlparse raises on a malformed authority (e.g. an unbalanced "[");
        # such a value is simply not a valid URL.
        return False
    return bool(parsed.netloc and parsed.scheme)

def _get_identifiers(self, subject):
    """Build the ``identifiers`` field for a subject.

    :param subject: value checked with ``_validate_subject_url``.
    :returns: a one-element list holding a ``url`` identifier with the
        string form of ``subject`` when it validates, otherwise ``[]``.
    """
    if not self._validate_subject_url(subject):
        return []
    url_identifier = {"scheme": "url", "identifier": str(subject)}
    return [url_identifier]

def _get_labels(self, subject, rdf_graph):
"""Extract labels (prefLabel or altLabel) for a subject."""
labels = {
Expand Down
41 changes: 41 additions & 0 deletions invenio_vocabularies/factories.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor/shelve: this file is starting to grow somewhat, so I'll shelve an issue to see how we can refactor it and have some sort of "pluggable" system

Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@
)
from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
from .contrib.subjects.bodc.datastreams import DATASTREAM_CONFIG as bodc_ds_config
from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config
from .contrib.subjects.euroscivoc.datastreams import (
DATASTREAM_CONFIG as euroscivoc_ds_config,
)
from .contrib.subjects.gemet.datastreams import DATASTREAM_CONFIG as gemet_ds_config


class VocabularyConfig:
Expand Down Expand Up @@ -137,6 +142,39 @@ def get_service(self):
raise NotImplementedError("Service not implemented for EDMO Affiliations")


class SubjectsEuroSciVocVocabularyConfig(VocabularyConfig):
    """Vocabulary config bound to the EuroSciVoc subjects datastream."""

    vocabulary_name = "subjects:euroscivoc"
    config = euroscivoc_ds_config

    def get_service(self):
        """Get the service for the vocabulary.

        :raises NotImplementedError: always; no service is provided here.
        """
        reason = "Service not implemented for EuroSciVoc Subjects"
        raise NotImplementedError(reason)


class SubjectsGEMETVocabularyConfig(VocabularyConfig):
    """Vocabulary config bound to the GEMET subjects datastream."""

    vocabulary_name = "subjects:gemet"
    config = gemet_ds_config

    def get_service(self):
        """Get the service for the vocabulary.

        :raises NotImplementedError: always; no service is provided here.
        """
        reason = "Service not implemented for GEMET Subjects"
        raise NotImplementedError(reason)


class SubjectsBODCVocabularyConfig(VocabularyConfig):
    """Vocabulary config bound to the BODC subjects datastream."""

    # NOTE(review): get_vocabulary_config registers this class under the key
    # "subjects:bodc", which differs from the name below — confirm which of
    # the two spellings is intended.
    vocabulary_name = "subjects:bodc-puv"
    config = bodc_ds_config

    def get_service(self):
        """Get the service for the vocabulary.

        :raises NotImplementedError: always; no service is provided here.
        """
        reason = "Service not implemented for BODC Subjects"
        raise NotImplementedError(reason)


def get_vocabulary_config(vocabulary):
"""Factory function to get the appropriate Vocabulary Config."""
vocab_config = {
Expand All @@ -148,5 +186,8 @@ def get_vocabulary_config(vocabulary):
"affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
"affiliations:edmo": AffiliationsEDMOVocabularyConfig,
"subjects": SubjectsVocabularyConfig,
"subjects:gemet": SubjectsGEMETVocabularyConfig,
"subjects:bodc": SubjectsBODCVocabularyConfig,
"subjects:euroscivoc": SubjectsEuroSciVocVocabularyConfig,
}
return vocab_config.get(vocabulary, VocabularyConfig)()
Loading