inveniosoftware · slint · Dec 9, 2024 · Nov 27, 2024 · Nov 28, 2024 · Dec 4, 2024
diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
@@ -196,6 +196,9 @@ def is_edmo(val):
 )
 """Subject GEMET file download link."""
 
+VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL = "http://vocab.nerc.ac.uk/collection/P01/current/?_profile=nvs&_mediatype=application/rdf+xml"
+"""Subject BODC-PUV (Parameter Usage Vocabulary, collection P01) RDF/XML file download link."""
+
 VOCABULARIES_AFFILIATIONS_EDMO_COUNTRY_MAPPING = {
     "Cape Verde": "Cabo Verde",
 }

diff --git a/invenio_vocabularies/contrib/subjects/bodc/__init__.py b/invenio_vocabularies/contrib/subjects/bodc/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""BODC Subjects module."""
diff --git a/invenio_vocabularies/contrib/subjects/bodc/datastreams.py b/invenio_vocabularies/contrib/subjects/bodc/datastreams.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""BODC subjects datastreams, readers, transformers, and writers."""
+
+from invenio_vocabularies.datastreams.errors import TransformerError
+from invenio_vocabularies.datastreams.readers import RDFReader
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+
+from ..config import bodc_puv_file_url
+
+# Available with the "rdf" extra
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+
+
+class BODCPUVSubjectsTransformer(RDFTransformer):
+    """
+    Transformer class to convert BODC-PUV RDF data to a dictionary format.
+
+    Deprecated concepts (`owl:deprecated` set to true) are skipped: `None` is returned.
+
+    Input:
+        - Relevant fields:
+            - `skos:notation`: Primary identifier for the concept (required).
+            - `skos:prefLabel`: Preferred labels with language codes.
+            - `skos:altLabel`: Alternative labels (optional).
+            - `skos:definition`: Definition of the concept (optional).
+            - `owl:deprecated`: Boolean flag indicating if the concept is deprecated.
+
+    Output:
+        - A dictionary with the following structure:
+            {
+                "id": "SDN:P01::SAGEMSFM",  # BODC-specific parameter ID (skos:notation).
+                "scheme": "BODC-PUV",  # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
+                "subject": "AMSSedAge",  # The first alternative label (skos:altLabel), or "" if there is none.
+                "title": {
+                    "en": "14C age of Foraminiferida"  # English preferred label (skos:prefLabel).
+                },
+                "props": {
+                    "definition": "Accelerated mass spectrometry on picked tests",  # First skos:definition; props is {} if absent.
+                },
+                "identifiers": [
+                    {
+                        "scheme": "url",  # Type of identifier (URL).
+                        "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM"  # URI of the concept.
+                    }
+                ]
+            }
+    """
+
+    def _get_subject_data(self, rdf_graph, subject):
+        """Group the objects of all triples about ``subject`` by predicate URI string."""
+        data = {}
+        for predicate, obj in rdf_graph.predicate_objects(subject=subject):
+            data.setdefault(str(predicate), []).append(obj)
+        return data
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format.
+
+        Return ``None`` for deprecated concepts; raise ``TransformerError`` if no notation.
+        """
+        subject_data = self._get_subject_data(rdf_graph, subject)
+        deprecated = subject_data.get(str(rdflib.namespace.OWL.deprecated), [False])
+        if deprecated and str(deprecated[0]).lower() == "true":
+            return None  # Skip deprecated subjects
+
+        notation = subject_data.get(str(self.skos_core.notation), [])
+        if not notation:
+            raise TransformerError(f"No id found for: {subject}")
+
+        alt_labels = subject_data.get(str(self.skos_core.altLabel), [])
+        definitions = subject_data.get(str(self.skos_core.definition), [])
+        # A missing definition must stay empty: str(None) would store "None".
+        definition = str(definitions[0]) if definitions else ""
+
+        return {
+            "id": str(notation[0]),
+            "scheme": "BODC-PUV",
+            "subject": str(alt_labels[0]) if alt_labels else "",
+            "title": self._get_labels(subject, rdf_graph),  # only built for kept entries
+            "props": {"definition": definition} if definition else {},
+            "identifiers": self._get_identifiers(subject),
+        }
+
+
+# Datastream configuration: transformer registry entry and the BODC-PUV import pipeline.
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {"bodc-transformer": BODCPUVSubjectsTransformer}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "http",
+            "args": {
+                "origin": bodc_puv_file_url,  # proxy imported from ..config
+            },
+        },
+        {"type": "rdf"},
+    ],
+    "transformers": [{"type": "bodc-transformer"}],  # key registered just above
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
+}
diff --git a/invenio_vocabularies/contrib/subjects/config.py b/invenio_vocabularies/contrib/subjects/config.py
@@ -34,6 +34,10 @@
     lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
 )
 
+bodc_puv_file_url = LocalProxy(  # BODC-PUV download URL, read from the app config
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL"]
+)
+
 
 class SubjectsSearchOptions(SearchOptions):
     """Search options."""

diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py
@@ -12,6 +12,7 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .bodc import datastreams as bodc_datastreams
 from .euroscivoc import datastreams as euroscivoc_datastreams
 from .gemet import datastreams as gemet_datastreams
 from .mesh import datastreams as mesh_datastreams
@@ -32,23 +33,20 @@ def _entry_id(self, entry):
 
 VOCABULARIES_DATASTREAM_READERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
-    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
-    **gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
 }
 """Subjects Data Streams readers."""
 
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
     **gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **bodc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
 }
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
-    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
-    **gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""
 

diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -14,7 +14,36 @@
 
 
 class EuroSciVocSubjectsTransformer(RDFTransformer):
-    """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+    """
+    Transformer class to convert EuroSciVoc RDF data to a dictionary format.
+
+    Input:
+        - Relevant fields:
+            - `skos:notation`: Primary identifier for the concept.
+            - `skos:prefLabel`: Preferred labels with language codes.
+            - `skos:altLabel`: Alternative labels.
+            - `skos:broader`: Broader concepts that this concept belongs to.
+
+    Output:
+        {
+            "id": "euroscivoc:1717",  # EuroSciVoc-specific concept ID (skos:notation).
+            "scheme": "EuroSciVoc",  # The scheme name indicating this is a EuroSciVoc concept.
+            "subject": "Satellite radio",  # The primary subject label (first preferred label in English, skos:prefLabel).
+            "title": {
+                "it": "Radio satellitare",  # Italian preferred label (skos:prefLabel).
+                "en": "Satellite radio",  # English preferred label (skos:prefLabel).
+            },
+            "props": {
+                "parents": "euroscivoc:1225",  # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
+            },
+            "identifiers": [
+                {
+                    "scheme": "url",  # Type of identifier (URL).
+                    "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",  # URI of the concept (rdf:about).
+                }
+            ],
+        }
+    """
 
     def _get_notation(self, subject, rdf_graph):
         """Extract the numeric notation for a subject."""
@@ -38,21 +67,18 @@ def _transform_entry(self, subject, rdf_graph):
             for n in reversed(self._find_parents(subject, rdf_graph))
             if n
         )
-        identifiers = [{"scheme": "url", "identifier": str(subject)}]
 
         return {
             "id": id,
             "scheme": "EuroSciVoc",
             "subject": labels.get("en", "").capitalize(),
             "title": labels,
             "props": {"parents": parents} if parents else {},
-            "identifiers": identifiers,
+            "identifiers": self._get_identifiers(subject),
         }
 
 
-# Configuration for datastream transformers, and writers
-VOCABULARIES_DATASTREAM_READERS = {}
-VOCABULARIES_DATASTREAM_WRITERS = {}
+# Configuration for datastream
 
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     "euroscivoc-transformer": EuroSciVocSubjectsTransformer
@@ -71,9 +97,5 @@ def _transform_entry(self, subject, rdf_graph):
         },
     ],
     "transformers": [{"type": "euroscivoc-transformer"}],
-    "writers": [
-        {
-            "type": "subjects-service",
-        }
-    ],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
 }
diff --git a/invenio_vocabularies/contrib/subjects/gemet/datastreams.py b/invenio_vocabularies/contrib/subjects/gemet/datastreams.py
@@ -20,7 +20,40 @@
 
 
 class GEMETSubjectsTransformer(RDFTransformer):
-    """Transformer class to convert GEMET RDF data to a dictionary format."""
+    """
+    Transformer class to convert GEMET RDF data to a dictionary format.
+
+    Input:
+        - Relevant fields:
+            - `skos:prefLabel`: Preferred labels with language codes.
+            - `skos:broader`: References to broader concepts (parent concepts).
+            - `skos:memberOf`: References to groups or themes the concept belongs to.
+
+    Output:
+        - A dictionary with the following structure:
+            {
+                "id": "gemet:concept/10008",  # GEMET-specific concept ID (skos:Concept).
+                "scheme": "GEMET",  # The scheme name indicating this is a GEMET concept.
+                "subject": "Consumer product",  # The subject label (first preferred label in English, skos:prefLabel).
+                "title": {
+                    "en": "Consumer product",  # English label for the concept (skos:prefLabel).
+                    "ar": "منتج استهلاكي"  # Arabic label for the concept (skos:prefLabel).
+                },
+                "props": {
+                    "parents": "gemet:concept/6660",  # The parent concept (skos:broader), identified by its GEMET Concept ID.
+                    "groups": ["http://www.eionet.europa.eu/gemet/group/10112"],  # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
+                    "themes": [
+                        "http://www.eionet.europa.eu/gemet/theme/27",  # Theme the concept belongs to (skos:memberOf)(rdfs:label).
+                    ]
+                },
+                "identifiers": [
+                    {
+                        "scheme": "url",  # Type of identifier (URL).
+                        "identifier": "http://www.eionet.europa.eu/gemet/concept/10008"  # URI of the concept (rdf:about).
+                    }
+                ]
+            }
+    """
 
     def _get_parent_notation(self, broader, rdf_graph):
         """Extract parent notation from GEMET URI."""
@@ -83,13 +116,11 @@ def _transform_entry(self, subject, rdf_graph):
             "subject": labels.get("en", "").capitalize(),
             "title": labels,
             "props": props,
-            "identifiers": identifiers,
+            "identifiers": self._get_identifiers(subject),
         }
 
 
-# Configuration for datastream transformers, and writers
-VOCABULARIES_DATASTREAM_READERS = {}
-VOCABULARIES_DATASTREAM_WRITERS = {}
+# Configuration for datastream
 
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
 

diff --git a/invenio_vocabularies/datastreams/transformers.py b/invenio_vocabularies/datastreams/transformers.py
@@ -9,6 +9,7 @@
 """Transformers module."""
 
 from abc import ABC, abstractmethod
+from urllib.parse import urlparse
 
 from lxml import etree
 
@@ -76,6 +77,17 @@ def skos_core(self):
         """Get the SKOS core namespace."""
         return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
 
+    def _validate_subject_url(self, subject):
+        """Return True when ``subject`` parses with both a scheme and a network location."""
+        url = urlparse(str(subject))
+        return bool(url.scheme and url.netloc)
+
+    def _get_identifiers(self, subject):
+        """Build the ``identifiers`` field; empty unless the subject is a valid URL."""
+        if not self._validate_subject_url(subject):
+            return []
+        return [{"scheme": "url", "identifier": str(subject)}]
+
     def _get_labels(self, subject, rdf_graph):
         """Extract labels (prefLabel or altLabel) for a subject."""
         labels = {

diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py
@@ -28,7 +28,12 @@
 )
 from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
 from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
+from .contrib.subjects.bodc.datastreams import DATASTREAM_CONFIG as bodc_ds_config
 from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config
+from .contrib.subjects.euroscivoc.datastreams import (
+    DATASTREAM_CONFIG as euroscivoc_ds_config,
+)
+from .contrib.subjects.gemet.datastreams import DATASTREAM_CONFIG as gemet_ds_config
 
 
 class VocabularyConfig:
@@ -137,6 +142,39 @@ def get_service(self):
         raise NotImplementedError("Service not implemented for EDMO Affiliations")
 
 
+class SubjectsEuroSciVocVocabularyConfig(VocabularyConfig):
+    """Vocabulary config wiring the EuroSciVoc subjects datastream."""
+
+    vocabulary_name = "subjects:euroscivoc"
+    config = euroscivoc_ds_config
+
+    def get_service(self):
+        """Raise ``NotImplementedError``: no service is implemented for this config."""
+        raise NotImplementedError("Service not implemented for EuroSciVoc Subjects")
+
+
+class SubjectsGEMETVocabularyConfig(VocabularyConfig):
+    """Vocabulary config wiring the GEMET subjects datastream."""
+
+    vocabulary_name = "subjects:gemet"
+    config = gemet_ds_config
+
+    def get_service(self):
+        """Raise ``NotImplementedError``: no service is implemented for this config."""
+        raise NotImplementedError("Service not implemented for GEMET Subjects")
+
+
+class SubjectsBODCVocabularyConfig(VocabularyConfig):
+    """BODC Subjects Vocabulary Config."""
+
+    config = bodc_ds_config
+    vocabulary_name = "subjects:bodc"  # same string as its get_vocabulary_config() key
+
+    def get_service(self):
+        """Raise ``NotImplementedError``: no service is implemented for this config."""
+        raise NotImplementedError("Service not implemented for BODC Subjects")
+
+
 def get_vocabulary_config(vocabulary):
     """Factory function to get the appropriate Vocabulary Config."""
     vocab_config = {
@@ -148,5 +186,8 @@ def get_vocabulary_config(vocabulary):
         "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
         "affiliations:edmo": AffiliationsEDMOVocabularyConfig,
         "subjects": SubjectsVocabularyConfig,
+        "subjects:gemet": SubjectsGEMETVocabularyConfig,
+        "subjects:bodc": SubjectsBODCVocabularyConfig,
+        "subjects:euroscivoc": SubjectsEuroSciVocVocabularyConfig,
     }
     return vocab_config.get(vocabulary, VocabularyConfig)()