diff --git a/invenio_vocabularies/contrib/subjects/bodc/datastreams.py b/invenio_vocabularies/contrib/subjects/bodc/datastreams.py
index b2f6545b..2a97f880 100644
--- a/invenio_vocabularies/contrib/subjects/bodc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/bodc/datastreams.py
@@ -17,7 +17,37 @@
class BODCPUVSubjectsTransformer(RDFTransformer):
- """Transformer class to convert BODC-PUV RDF data to a dictionary format."""
+ """
+ Transformer class to convert BODC-PUV RDF data to a dictionary format.
+
+ Input:
+ - Relevant fields:
+ - `skos:notation`: Primary identifier for the concept.
+ - `skos:prefLabel`: Preferred labels with language codes.
+ - `skos:altLabel`: Alternative labels (optional).
+ - `skos:definition`: Definitions of the concept.
+ - `owl:deprecated`: Boolean flag indicating if the concept is deprecated.
+
+ Output:
+ - A dictionary with the following structure:
+ {
+ "id": "SDN:P01::SAGEMSFM", # BODC-specific parameter ID (skos:notation).
+ "scheme": "BODC-PUV", # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
+ "subject": "AMSSedAge", # The alternative label (skos:altLabel), if available, or an empty string.
+ "title": {
+ "en": "14C age of Foraminiferida" # English preferred label (skos:prefLabel).
+ },
+ "props": {
+ "definition": "Accelerated mass spectrometry on picked tests", # Definition of subject (skos:definition).
+ },
+ "identifiers": [
+ {
+ "scheme": "url", # Type of identifier (URL).
+ "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/" # URI of the concept.
+ }
+ ]
+ }
+ """
def _get_subject_data(self, rdf_graph, subject):
"""Fetch all triples for a subject and organize them into a dictionary."""
@@ -31,32 +61,29 @@ def _get_subject_data(self, rdf_graph, subject):
def _transform_entry(self, subject, rdf_graph):
"""Transform an entry to the required dictionary format."""
+ labels = self._get_labels(subject, rdf_graph)
subject_data = self._get_subject_data(rdf_graph, subject)
deprecated = subject_data.get(str(OWL.deprecated), [False])
if deprecated and str(deprecated[0]).lower() == "true":
return None # Skip deprecated subjects
notation = subject_data.get(str(self.skos_core.notation), [])
- id = notation[0] if notation else None
+ id = str(notation[0]) if notation else None
- labels = {
- obj.language: obj.value.capitalize()
- for obj in subject_data.get(str(self.skos_core.prefLabel), [])
- if obj.language and "-" not in obj.language
- }
alt_labels = [obj for obj in subject_data.get(str(self.skos_core.altLabel), [])]
- subject_text = alt_labels[0] if alt_labels else None
-
- identifiers = [{"scheme": "url", "identifier": str(subject)}]
- props = {}
+ subject_text = str(alt_labels[0]) if alt_labels else ""
+ # Stringify only an existing definition: str(None) is the truthy text
+ # "None" and would end up in props as a bogus definition.
+ definitions = subject_data.get(str(self.skos_core.definition), [])
+ definition = str(definitions[0]) if definitions else None
return {
"id": id,
"scheme": "BODC-PUV",
"subject": subject_text,
"title": labels,
- "props": props,
- "identifiers": identifiers,
+ "props": {"definition": definition} if definition else {},
+ "identifiers": self._get_identifiers(subject),
}
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
index 656cb591..337c340b 100644
--- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -14,7 +14,36 @@
class EuroSciVocSubjectsTransformer(RDFTransformer):
- """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+ """
+ Transformer class to convert EuroSciVoc RDF data to a dictionary format.
+
+ Input:
+ - Relevant fields:
+ - `skos:notation`: Primary identifier for the concept.
+ - `skos:prefLabel`: Preferred labels with language codes.
+ - `skos:altLabel`: Alternative labels.
+ - `skos:broader`: Broader concepts that this concept belongs to.
+
+ Output:
+ {
+ "id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
+ "scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
+ "subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
+ "title": {
+ "it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
+ "en": "Satellite radio", # English preferred label (skos:prefLabel).
+ },
+ "props": {
+ "parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
+ },
+ "identifiers": [
+ {
+ "scheme": "url", # Type of identifier (URL).
+ "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
+ }
+ ],
+ }
+ """
def _get_notation(self, subject, rdf_graph):
"""Extract the numeric notation for a subject."""
@@ -38,7 +67,6 @@ def _transform_entry(self, subject, rdf_graph):
for n in reversed(self._find_parents(subject, rdf_graph))
if n
)
- identifiers = [{"scheme": "url", "identifier": str(subject)}]
return {
"id": id,
@@ -46,7 +74,7 @@ def _transform_entry(self, subject, rdf_graph):
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": {"parents": parents} if parents else {},
- "identifiers": identifiers,
+ "identifiers": self._get_identifiers(subject),
}
diff --git a/invenio_vocabularies/contrib/subjects/gemet/datastreams.py b/invenio_vocabularies/contrib/subjects/gemet/datastreams.py
index e00f9d1c..5e29f105 100644
--- a/invenio_vocabularies/contrib/subjects/gemet/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/gemet/datastreams.py
@@ -20,7 +20,40 @@
class GEMETSubjectsTransformer(RDFTransformer):
- """Transformer class to convert GEMET RDF data to a dictionary format."""
+ """
+ Transformer class to convert GEMET RDF data to a dictionary format.
+
+ Input:
+ - Relevant fields:
+ - `skos:prefLabel`: Preferred labels with language codes.
+ - `skos:broader`: References to broader concepts (parent concepts).
+ - `skos:memberOf`: References to groups or themes the concept belongs to.
+
+ Output:
+ - A dictionary with the following structure:
+ {
+ "id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
+ "scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
+ "subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
+ "title": {
+ "en": "Consumer product", # English label for the concept (skos:prefLabel).
+ "ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
+ },
+ "props": {
+ "parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
+ "groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
+ "themes": [
+ "http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
+ ]
+ },
+ "identifiers": [
+ {
+ "scheme": "url", # Type of identifier (URL).
+ "identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
+ }
+ ]
+ }
+ """
def _get_parent_notation(self, broader, rdf_graph):
"""Extract parent notation from GEMET URI."""
@@ -83,7 +116,7 @@ def _transform_entry(self, subject, rdf_graph):
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": props,
- "identifiers": identifiers,
+ "identifiers": self._get_identifiers(subject),
}
diff --git a/invenio_vocabularies/datastreams/transformers.py b/invenio_vocabularies/datastreams/transformers.py
index fde3ac2c..f33eb262 100644
--- a/invenio_vocabularies/datastreams/transformers.py
+++ b/invenio_vocabularies/datastreams/transformers.py
@@ -9,6 +9,7 @@
"""Transformers module."""
from abc import ABC, abstractmethod
+from urllib.parse import urlparse
from lxml import etree
@@ -76,6 +77,17 @@ def skos_core(self):
"""Get the SKOS core namespace."""
return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+ def _validate_subject_url(self, subject):
+ """Tell whether ``subject`` parses as a URL with both scheme and host."""
+ parts = urlparse(str(subject))
+ return bool(parts.scheme) and bool(parts.netloc)
+
+ def _get_identifiers(self, subject):
+ """Return the ``url`` identifier list for ``subject``, or [] if it is no URL."""
+ if not self._validate_subject_url(subject):
+ return []
+ return [{"scheme": "url", "identifier": str(subject)}]
+
def _get_labels(self, subject, rdf_graph):
"""Extract labels (prefLabel or altLabel) for a subject."""
labels = {
diff --git a/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py b/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py
new file mode 100644
index 00000000..765e1ac0
--- /dev/null
+++ b/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+import io
+
+import pytest
+from rdflib import Graph
+
+from invenio_vocabularies.contrib.subjects.bodc.datastreams import (
+ BODCPUVSubjectsTransformer,
+)
+from invenio_vocabularies.datastreams.datastreams import StreamEntry
+from invenio_vocabularies.datastreams.readers import RDFReader
+
+XML_DATA = bytes(
+ """
+
+ BODC Parameter Usage Vocabulary
+ BODC Parameter Usage Vocabulary
+ BODC PUV
+ BODC PUV
+ Terms built using the BODC parameter semantic model designed to describe individual measured phenomena. May be used to mark up sets of data such as a NetCDF array or spreadsheet column. Units must be specified when using a P01 code. The P06 unit that is linked to individual P01 in the NVS is the one used in BODC's systems but external users can use any appropriate units.
+
+
+
+ SDN:P01::SAGEMSFM
+ SDN:P01::SAGEMSFM
+ 2008-10-16 16:27:06.0
+ SDN:P01::SAGEMSFM
+ 14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry
+ AMSSedAge
+ Accelerated mass spectrometry on picked tests
+ 1
+
+ 1
+ 2008-10-16 16:27:06.0
+ accepted
+ false
+
+
+
+
+
+
+
+
+""",
+ encoding="utf-8",
+)
+
+
+@pytest.fixture(scope="module")
+def expected_from_rdf():
+ """Expected transformer output that the test compares against ``XML_DATA``."""
+ return [
+ {
+ "id": "SDN:P01::SAGEMSFM",
+ "scheme": "BODC-PUV",
+ "subject": "AMSSedAge",
+ "title": {
+ "en": "14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry",
+ },
+ "props": {"definition": "Accelerated mass spectrometry on picked tests"},
+ "identifiers": [
+ {
+ "scheme": "url",
+ "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/",
+ }
+ ],
+ }
+ ]
+
+
+def test_bodc_puv_concept_transformer(expected_from_rdf):
+ """Transform the BODC PUV sample in ``XML_DATA`` and compare with the fixture."""
+ reader = RDFReader()
+ rdf_data = io.BytesIO(XML_DATA)
+ rdf_graph = Graph()
+ rdf_graph.parse(rdf_data, format="xml")
+ stream_entries = list(reader._iter(rdf_graph))
+ # The sample RDF must yield at least one entry to transform.
+ assert len(stream_entries) > 0
+ transformer = BODCPUVSubjectsTransformer()
+ # Run every raw RDF entry through the transformer under test.
+ result = [transformer.apply(StreamEntry(entry)).entry for entry in stream_entries]
+ assert expected_from_rdf == result
diff --git a/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py b/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py
index 6a834581..a9be9c3f 100644
--- a/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py
+++ b/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py
@@ -25,24 +25,24 @@
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
xml:base="http://www.eionet.europa.eu/gemet/">
-
-
- Consumer product
- منتج استهلاكي
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+ Consumer product
+ منتج استهلاكي
+
+
+
+
+
+
+
+
+
+
+
+
+
+
""",
encoding="utf-8",
)