diff --git a/invenio_vocabularies/contrib/subjects/bodc/datastreams.py b/invenio_vocabularies/contrib/subjects/bodc/datastreams.py index b2f6545b..2a97f880 100644 --- a/invenio_vocabularies/contrib/subjects/bodc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/bodc/datastreams.py @@ -17,7 +17,37 @@ class BODCPUVSubjectsTransformer(RDFTransformer): - """Transformer class to convert BODC-PUV RDF data to a dictionary format.""" + """ + Transformer class to convert BODC-PUV RDF data to a dictionary format. + + Input: + - Relevant fields: + - `skos:notation`: Primary identifier for the concept. + - `skos:prefLabel`: Preferred labels with language codes. + - `skos:altLabel`: Alternative labels (optional). + - `skos:definition`: Definitions of the concept. + - `owl:deprecated`: Boolean flag indicating if the concept is deprecated. + + Output: + - A dictionary with the following structure: + { + "id": "SDN:P01::SAGEMSFM", # BODC-specific parameter ID (skos:notation). + "scheme": "BODC-PUV", # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept. + "subject": "AMSSedAge", # The alternative label (skos:altLabel), if available, or an empty string. + "title": { + "en": "14C age of Foraminiferida" # English preferred label (skos:prefLabel). + }, + "props": { + "definition": "Accelerated mass spectrometry on picked tests", # Definition of subject (skos:definition). + }, + "identifiers": [ + { + "scheme": "url", # Type of identifier (URL). + "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM" # URI of the concept. 
+ } + ] + } + """ def _get_subject_data(self, rdf_graph, subject): """Fetch all triples for a subject and organize them into a dictionary.""" @@ -31,32 +61,26 @@ def _get_subject_data(self, rdf_graph, subject): def _transform_entry(self, subject, rdf_graph): """Transform an entry to the required dictionary format.""" + labels = self._get_labels(subject, rdf_graph) subject_data = self._get_subject_data(rdf_graph, subject) deprecated = subject_data.get(str(OWL.deprecated), [False]) if deprecated and str(deprecated[0]).lower() == "true": return None # Skip deprecated subjects notation = subject_data.get(str(self.skos_core.notation), []) - id = notation[0] if notation else None + id = str(notation[0]) if notation else None - labels = { - obj.language: obj.value.capitalize() - for obj in subject_data.get(str(self.skos_core.prefLabel), []) - if obj.language and "-" not in obj.language - } alt_labels = [obj for obj in subject_data.get(str(self.skos_core.altLabel), [])] - subject_text = alt_labels[0] if alt_labels else None - - identifiers = [{"scheme": "url", "identifier": str(subject)}] - props = {} + subject_text = str(alt_labels[0]) if alt_labels else "" + definition = str(subject_data.get(str(self.skos_core.definition), [None])[0]) return { "id": id, "scheme": "BODC-PUV", "subject": subject_text, "title": labels, - "props": props, - "identifiers": identifiers, + "props": {"definition": definition} if definition else {}, + "identifiers": self._get_identifiers(subject), } diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 656cb591..337c340b 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -14,7 +14,36 @@ class EuroSciVocSubjectsTransformer(RDFTransformer): - """Transformer class to convert EuroSciVoc RDF data to a dictionary format.""" + """ + Transformer class to convert 
EuroSciVoc RDF data to a dictionary format. + + Input: + - Relevant fields: + - `skos:notation`: Primary identifier for the concept. + - `skos:prefLabel`: Preferred labels with language codes. + - `skos:altLabel`: Alternative labels. + - `skos:broader`: Broader concepts that this concept belongs to. + + Output: + { + "id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation). + "scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept. + "subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel). + "title": { + "it": "Radio satellitare", # Italian preferred label (skos:prefLabel). + "en": "Satellite radio", # English preferred label (skos:prefLabel). + }, + "props": { + "parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID. + }, + "identifiers": [ + { + "scheme": "url", # Type of identifier (URL). + "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about). 
+ } + ], + } + """ def _get_notation(self, subject, rdf_graph): """Extract the numeric notation for a subject.""" @@ -38,7 +67,6 @@ def _transform_entry(self, subject, rdf_graph): for n in reversed(self._find_parents(subject, rdf_graph)) if n ) - identifiers = [{"scheme": "url", "identifier": str(subject)}] return { "id": id, @@ -46,7 +74,7 @@ def _transform_entry(self, subject, rdf_graph): "subject": labels.get("en", "").capitalize(), "title": labels, "props": {"parents": parents} if parents else {}, - "identifiers": identifiers, + "identifiers": self._get_identifiers(subject), } diff --git a/invenio_vocabularies/contrib/subjects/gemet/datastreams.py b/invenio_vocabularies/contrib/subjects/gemet/datastreams.py index e00f9d1c..5e29f105 100644 --- a/invenio_vocabularies/contrib/subjects/gemet/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/gemet/datastreams.py @@ -20,7 +20,40 @@ class GEMETSubjectsTransformer(RDFTransformer): - """Transformer class to convert GEMET RDF data to a dictionary format.""" + """ + Transformer class to convert GEMET RDF data to a dictionary format. + + Input: + - Relevant fields: + - `skos:prefLabel`: Preferred labels with language codes. + - `skos:broader`: References to broader concepts (parent concepts). + - `skos:memberOf`: References to groups or themes the concept belongs to. + + Output: + - A dictionary with the following structure: + { + "id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept). + "scheme": "GEMET", # The scheme name indicating this is a GEMET concept. + "subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel). + "title": { + "en": "Consumer product", # English label for the concept (skos:prefLabel). + "ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel). + }, + "props": { + "parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID. 
+ "groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel). + "themes": [ + "http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label). + ] + }, + "identifiers": [ + { + "scheme": "url", # Type of identifier (URL). + "identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about). + } + ] + } + """ def _get_parent_notation(self, broader, rdf_graph): """Extract parent notation from GEMET URI.""" @@ -83,7 +116,7 @@ def _transform_entry(self, subject, rdf_graph): "subject": labels.get("en", "").capitalize(), "title": labels, "props": props, - "identifiers": identifiers, + "identifiers": self._get_identifiers(subject), } diff --git a/invenio_vocabularies/datastreams/transformers.py b/invenio_vocabularies/datastreams/transformers.py index fde3ac2c..f33eb262 100644 --- a/invenio_vocabularies/datastreams/transformers.py +++ b/invenio_vocabularies/datastreams/transformers.py @@ -9,6 +9,7 @@ """Transformers module.""" from abc import ABC, abstractmethod +from urllib.parse import urlparse from lxml import etree @@ -76,6 +77,17 @@ def skos_core(self): """Get the SKOS core namespace.""" return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#") + def _validate_subject_url(self, subject): + """Check if the subject is a valid URL.""" + parsed = urlparse(str(subject)) + return bool(parsed.netloc and parsed.scheme) + + def _get_identifiers(self, subject): + """Generate identifiers field for a valid subject URL.""" + if self._validate_subject_url(subject): + return [{"scheme": "url", "identifier": str(subject)}] + return [] + def _get_labels(self, subject, rdf_graph): """Extract labels (prefLabel or altLabel) for a subject.""" labels = { diff --git a/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py b/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py new file mode 100644 index 00000000..765e1ac0 --- /dev/null 
+++ b/tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +import io + +import pytest +from rdflib import Graph + +from invenio_vocabularies.contrib.subjects.bodc.datastreams import ( + BODCPUVSubjectsTransformer, +) +from invenio_vocabularies.datastreams.datastreams import StreamEntry +from invenio_vocabularies.datastreams.readers import RDFReader + +XML_DATA = bytes( + """ + + BODC Parameter Usage Vocabulary + BODC Parameter Usage Vocabulary + BODC PUV + BODC PUV + Terms built using the BODC parameter semantic model designed to describe individual measured phenomena. May be used to mark up sets of data such as a NetCDF array or spreadsheet column. Units must be specified when using a P01 code. The P06 unit that is linked to individual P01 in the NVS is the one used in BODC's systems but external users can use any appropriate units. 
+ + + + SDN:P01::SAGEMSFM + SDN:P01::SAGEMSFM + 2008-10-16 16:27:06.0 + SDN:P01::SAGEMSFM + 14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry + AMSSedAge + Accelerated mass spectrometry on picked tests + 1 + + 1 + 2008-10-16 16:27:06.0 + accepted + false + + + + + + + + +""", + encoding="utf-8", +) + + +@pytest.fixture(scope="module") +def expected_from_rdf(): + return [ + { + "id": "SDN:P01::SAGEMSFM", + "scheme": "BODC-PUV", + "subject": "AMSSedAge", + "title": { + "en": "14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry", + }, + "props": {"definition": "Accelerated mass spectrometry on picked tests"}, + "identifiers": [ + { + "scheme": "url", + "identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/", + } + ], + } + ] + + +def test_gemet_concept_transformer_pref_label(expected_from_rdf): + reader = RDFReader() + rdf_data = io.BytesIO(XML_DATA) + rdf_graph = Graph() + rdf_graph.parse(rdf_data, format="xml") + stream_entries = list(reader._iter(rdf_graph)) + assert len(stream_entries) > 0 + transformer = BODCPUVSubjectsTransformer() + result = [] + for entry in stream_entries: + entry = transformer.apply(StreamEntry(entry)).entry + result.append(entry) + assert expected_from_rdf == result diff --git a/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py b/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py index 6a834581..a9be9c3f 100644 --- a/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py +++ b/tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py @@ -25,24 +25,24 @@ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xml:base="http://www.eionet.europa.eu/gemet/"> - - - Consumer product - منتج استهلاكي - - - - - - - - - - - - - - + + + Consumer product + منتج استهلاكي + + + + + + + + + + + + 
+ + """, encoding="utf-8", )