Skip to content

Commit

Permalink
subjects: added datastream for bodc
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Nov 27, 2024
1 parent 1fbf3c5 commit e8dd5da
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 0 deletions.
3 changes: 3 additions & 0 deletions invenio_vocabularies/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ def is_edmo(val):
)
"""Subject GEMET file download link."""

# The URL is split over two literals only to keep the line short; implicit
# concatenation yields a single string.
VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL = (
    "http://vocab.nerc.ac.uk/collection/P01/current/"
    "?_profile=nvs&_mediatype=application/rdf+xml"
)
"""Subject BODC-PUV file download link."""

# Country-name replacements (key -> value); presumably applied to EDMO
# affiliation data -- confirm against the code that reads this setting.
VOCABULARIES_AFFILIATIONS_EDMO_COUNTRY_MAPPING = {"Cape Verde": "Cabo Verde"}
Expand Down
9 changes: 9 additions & 0 deletions invenio_vocabularies/contrib/subjects/bodc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""BODC Subjects module."""
81 changes: 81 additions & 0 deletions invenio_vocabularies/contrib/subjects/bodc/datastreams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""BODC subjects datastreams, readers, transformers, and writers."""

from rdflib.namespace import OWL, RDFS

from invenio_vocabularies.datastreams.readers import RDFReader
from invenio_vocabularies.datastreams.transformers import RDFTransformer

from ..config import bodc_puv_file_url


class BODCPUVSubjectsTransformer(RDFTransformer):
    """Transformer class to convert BODC-PUV RDF data to a dictionary format.

    For every RDF subject the transformer builds a dictionary with the keys
    ``id``, ``scheme``, ``subject``, ``title``, ``props`` and ``identifiers``.
    Subjects flagged with ``owl:deprecated`` equal to ``true`` are skipped.
    """

    def _get_subject_data(self, rdf_graph, subject):
        """Fetch all triples for a subject and organize them into a dictionary.

        :param rdf_graph: Graph object providing
            ``predicate_objects(subject=...)``.
        :param subject: RDF node whose predicate/object pairs are collected.
        :returns: A dict mapping each predicate, converted to ``str``, to the
            list of objects found for it, in the order the graph yields them.
        """
        data = {}
        for predicate, obj in rdf_graph.predicate_objects(subject=subject):
            data.setdefault(str(predicate), []).append(obj)
        return data

    def _transform_entry(self, subject, rdf_graph):
        """Transform an entry to the required dictionary format.

        :param subject: RDF node identifying the concept to transform.
        :param rdf_graph: Graph holding the triples of ``subject``.
        :returns: ``None`` when the subject is marked as deprecated, otherwise
            a dict with:

            * ``id``: first ``skos:notation`` value as ``str`` (``None`` when
              the subject has no notation),
            * ``scheme``: always ``"BODC-PUV"``,
            * ``subject``: first ``skos:altLabel`` value as ``str`` (``None``
              when there is no alternative label),
            * ``title``: language tag -> capitalized ``skos:prefLabel`` value,
            * ``props``: an empty dict,
            * ``identifiers``: a single ``url`` identifier built from the
              subject itself.
        """
        subject_data = self._get_subject_data(rdf_graph, subject)

        deprecated = subject_data.get(str(OWL.deprecated))
        if deprecated and str(deprecated[0]).lower() == "true":
            return None  # Skip deprecated subjects

        # Convert graph terms to plain ``str`` so the resulting entry holds no
        # RDF library objects, matching how ``title`` and ``identifiers`` are
        # built below.
        notation = subject_data.get(str(self.skos_core.notation), [])
        # NOTE(review): a subject without skos:notation still yields an entry
        # whose id is None -- confirm the writer copes with that.
        entry_id = str(notation[0]) if notation else None

        # Keep only labels that carry a language tag without a region/script
        # suffix (tags containing "-" are ignored).
        labels = {
            obj.language: obj.value.capitalize()
            for obj in subject_data.get(str(self.skos_core.prefLabel), [])
            if obj.language and "-" not in obj.language
        }

        alt_labels = subject_data.get(str(self.skos_core.altLabel), [])
        subject_text = str(alt_labels[0]) if alt_labels else None

        return {
            "id": entry_id,
            "scheme": "BODC-PUV",
            "subject": subject_text,
            "title": labels,
            "props": {},
            "identifiers": [{"scheme": "url", "identifier": str(subject)}],
        }


# Datastream component registries contributed by the BODC module. Only a
# transformer is registered here; the reader and writer registries are empty.
VOCABULARIES_DATASTREAM_READERS = {}
VOCABULARIES_DATASTREAM_WRITERS = {}

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "bodc-transformer": BODCPUVSubjectsTransformer,
}

# Datastream definition for BODC-PUV: an "http" reader whose origin is the
# configured file URL, followed by an "rdf" reader, the transformer registered
# above, and an "async" writer wrapping the "subjects-service" writer.
DATASTREAM_CONFIG = {
    "readers": [
        {"type": "http", "args": {"origin": bodc_puv_file_url}},
        {"type": "rdf"},
    ],
    "transformers": [
        {"type": "bodc-transformer"},
    ],
    "writers": [
        {
            "args": {"writer": {"type": "subjects-service"}},
            "type": "async",
        },
    ],
}
4 changes: 4 additions & 0 deletions invenio_vocabularies/contrib/subjects/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
)

# Proxy for the ``VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL`` setting. The lambda
# reads the value from ``current_app.config`` when it is called, so nothing is
# captured from the configuration at import time.
bodc_puv_file_url = LocalProxy(
lambda: current_app.config["VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL"]
)


class SubjectsSearchOptions(SearchOptions):
"""Search options."""
Expand Down
4 changes: 4 additions & 0 deletions invenio_vocabularies/contrib/subjects/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .bodc import datastreams as bodc_datastreams
from .euroscivoc import datastreams as euroscivoc_datastreams
from .gemet import datastreams as gemet_datastreams
from .mesh import datastreams as mesh_datastreams
Expand All @@ -34,13 +35,15 @@ def _entry_id(self, entry):
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
**bodc_datastreams.VOCABULARIES_DATASTREAM_READERS,
}
"""Subjects Data Streams readers."""

# Merge the transformer registries of every subject vocabulary module into one
# dict. With dict unpacking, a key that appears in a later module replaces the
# value contributed by an earlier one, so the order below matters on collisions.
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**bodc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
}
"""Subjects Data Streams transformers."""

Expand All @@ -49,6 +52,7 @@ def _entry_id(self, entry):
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**bodc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
}
"""Subjects Data Streams writers."""

Expand Down

0 comments on commit e8dd5da

Please sign in to comment.