Skip to content

Commit

Permalink
subjects: minor refactor,updated docstrings and test
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Nov 28, 2024
1 parent c46de43 commit f9e30ae
Show file tree
Hide file tree
Showing 6 changed files with 223 additions and 36 deletions.
50 changes: 37 additions & 13 deletions invenio_vocabularies/contrib/subjects/bodc/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,37 @@


class BODCPUVSubjectsTransformer(RDFTransformer):
"""Transformer class to convert BODC-PUV RDF data to a dictionary format."""
"""
Transformer class to convert BODC-PUV RDF data to a dictionary format.
Input:
- Relevant fields:
- `skos:notation`: Primary identifier for the concept.
- `skos:prefLabel`: Preferred labels with language codes.
- `skos:altLabel`: Alternative labels (optional).
- `skos:definition`: Definitions of the concept.
- `owl:deprecated`: Boolean flag indicating if the concept is deprecated.
Output:
- A dictionary with the following structure:
{
"id": "SDN:P01::SAGEMSFM", # BODC-specific parameter ID (skos:notation).
"scheme": "BODC-PUV", # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
"subject": "AMSSedAge", # The alternative label (skos:altLabel), if available, or an empty string.
"title": {
"en": "14C age of Foraminiferida" # English preferred label (skos:prefLabel).
},
"props": {
"definition": "Accelerated mass spectrometry on picked tests", # Definition of subject (skos:definition).
},
"identifiers": [
{
"scheme": "url", # Type of identifier (URL).
"identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM" # URI of the concept.
}
]
}
"""

def _get_subject_data(self, rdf_graph, subject):
"""Fetch all triples for a subject and organize them into a dictionary."""
Expand All @@ -31,32 +61,26 @@ def _get_subject_data(self, rdf_graph, subject):

def _transform_entry(self, subject, rdf_graph):
    """Transform one BODC-PUV concept into the vocabulary dictionary format.

    :param subject: RDF node of the concept being transformed.
    :param rdf_graph: RDF graph holding the triples of ``subject``.
    :returns: ``None`` when the concept is flagged ``owl:deprecated``;
        otherwise a dictionary with the keys ``id``, ``scheme``, ``subject``,
        ``title``, ``props`` and ``identifiers``.
    """
    # Used below as a mapping from predicate (as a string) to a sequence of
    # RDF objects for this subject.
    subject_data = self._get_subject_data(rdf_graph, subject)

    deprecated = subject_data.get(str(OWL.deprecated), [False])
    if deprecated and str(deprecated[0]).lower() == "true":
        return None  # Skip deprecated subjects

    notation = subject_data.get(str(self.skos_core.notation), [])
    # Named ``entry_id`` so the ``id`` builtin is not shadowed.
    entry_id = str(notation[0]) if notation else None

    # Only the first character is upper-cased. ``str.capitalize()`` would also
    # lower-case the rest of the label and destroy acronyms and proper names
    # (e.g. "14C", "ITIS", "WoRMS").
    # Labels without a language tag, or with a regional tag such as "en-GB",
    # are left out.
    labels = {}
    for obj in subject_data.get(str(self.skos_core.prefLabel), []):
        if obj.language and "-" not in obj.language:
            text = str(obj.value)
            labels[obj.language] = text[:1].upper() + text[1:]

    alt_labels = list(subject_data.get(str(self.skos_core.altLabel), []))
    subject_text = str(alt_labels[0]) if alt_labels else ""

    # Look at the list before converting to ``str``: converting a missing
    # value would produce the truthy string "None" and end up in ``props``.
    definitions = list(subject_data.get(str(self.skos_core.definition), []))
    definition = str(definitions[0]) if definitions else ""

    return {
        "id": entry_id,
        "scheme": "BODC-PUV",
        "subject": subject_text,
        "title": labels,
        "props": {"definition": definition} if definition else {},
        "identifiers": self._get_identifiers(subject),
    }


Expand Down
34 changes: 31 additions & 3 deletions invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,36 @@


class EuroSciVocSubjectsTransformer(RDFTransformer):
"""Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
"""
Transformer class to convert EuroSciVoc RDF data to a dictionary format.
Input:
- Relevant fields:
- `skos:notation`: Primary identifier for the concept.
- `skos:prefLabel`: Preferred labels with language codes.
- `skos:altLabel`: Alternative labels.
- `skos:broader`: Broader concepts that this concept belongs to.
Output:
{
"id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
"scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
"subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
"title": {
"it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
"en": "Satellite radio", # English preferred label (skos:prefLabel).
},
"props": {
"parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
},
"identifiers": [
{
"scheme": "url", # Type of identifier (URL).
"identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
}
],
}
"""

def _get_notation(self, subject, rdf_graph):
"""Extract the numeric notation for a subject."""
Expand All @@ -38,15 +67,14 @@ def _transform_entry(self, subject, rdf_graph):
for n in reversed(self._find_parents(subject, rdf_graph))
if n
)
identifiers = [{"scheme": "url", "identifier": str(subject)}]

return {
"id": id,
"scheme": "EuroSciVoc",
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": {"parents": parents} if parents else {},
"identifiers": identifiers,
"identifiers": self._get_identifiers(subject),
}


Expand Down
37 changes: 35 additions & 2 deletions invenio_vocabularies/contrib/subjects/gemet/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,40 @@


class GEMETSubjectsTransformer(RDFTransformer):
"""Transformer class to convert GEMET RDF data to a dictionary format."""
"""
Transformer class to convert GEMET RDF data to a dictionary format.
Input:
- Relevant fields:
- `skos:prefLabel`: Preferred labels with language codes.
- `skos:broader`: References to broader concepts (parent concepts).
- `skos:memberOf`: References to groups or themes the concept belongs to.
Output:
- A dictionary with the following structure:
{
"id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
"scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
"subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
"title": {
"en": "Consumer product", # English label for the concept (skos:prefLabel).
"ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
},
"props": {
"parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
"groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
"themes": [
"http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
]
},
"identifiers": [
{
"scheme": "url", # Type of identifier (URL).
"identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
}
]
}
"""

def _get_parent_notation(self, broader, rdf_graph):
"""Extract parent notation from GEMET URI."""
Expand Down Expand Up @@ -83,7 +116,7 @@ def _transform_entry(self, subject, rdf_graph):
"subject": labels.get("en", "").capitalize(),
"title": labels,
"props": props,
"identifiers": identifiers,
"identifiers": self._get_identifiers(subject),
}


Expand Down
12 changes: 12 additions & 0 deletions invenio_vocabularies/datastreams/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""Transformers module."""

from abc import ABC, abstractmethod
from urllib.parse import urlparse

from lxml import etree

Expand Down Expand Up @@ -76,6 +77,17 @@ def skos_core(self):
"""Get the SKOS core namespace."""
return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

def _validate_subject_url(self, subject):
"""Check if the subject is a valid URL."""
parsed = urlparse(str(subject))
return bool(parsed.netloc and parsed.scheme)

def _get_identifiers(self, subject):
"""Generate identifiers field for a valid subject URL."""
if self._validate_subject_url(subject):
return [{"scheme": "url", "identifier": str(subject)}]
return []

def _get_labels(self, subject, rdf_graph):
"""Extract labels (prefLabel or altLabel) for a subject."""
labels = {
Expand Down
90 changes: 90 additions & 0 deletions tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

import io

import pytest
from rdflib import Graph

from invenio_vocabularies.contrib.subjects.bodc.datastreams import (
BODCPUVSubjectsTransformer,
)
from invenio_vocabularies.datastreams.datastreams import StreamEntry
from invenio_vocabularies.datastreams.readers import RDFReader

XML_DATA = bytes(
"""<?xml version="1.0" encoding="UTF-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:dc="http://purl.org/dc/terms/" xmlns:dce="http://purl.org/dc/elements/1.1/" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:grg="http://www.isotc211.org/schemas/grg/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:void="http://rdfs.org/ns/void#" xmlns:pav="http://purl.org/pav/" xmlns:prov="https://www.w3.org/ns/prov#" xmlns:reg="http://purl.org/linked-data/registry#" xmlns:cpm="http://purl.org/voc/cpm#" xmlns:qudt="https://qudt.org/2.1/schema/qudt#" xmlns:semapv="http://w3id.org/semapv/vocab/" xmlns:iop="https://w3id.org/iadopt/ont#" xmlns:sssom="https://w3id.org/sssom/schema/" xmlns:puv="https://w3id.org/env/puv#">
<skos:Collection rdf:about="http://vocab.nerc.ac.uk/collection/P01/current/">
<skos:prefLabel>BODC Parameter Usage Vocabulary</skos:prefLabel>
<dc:title>BODC Parameter Usage Vocabulary</dc:title>
<skos:altLabel>BODC PUV</skos:altLabel>
<dc:alternative>BODC PUV</dc:alternative>
<dc:description>Terms built using the BODC parameter semantic model designed to describe individual measured phenomena. May be used to mark up sets of data such as a NetCDF array or spreadsheet column. Units must be specified when using a P01 code. The P06 unit that is linked to individual P01 in the NVS is the one used in BODC's systems but external users can use any appropriate units.</dc:description>
<dc:license rdf:resource="https://creativecommons.org/licenses/by/4.0/"/>
<skos:member>
<skos:Concept xml:base="http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/" rdf:about="http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/">
<dc:identifier>SDN:P01::SAGEMSFM</dc:identifier>
<dce:identifier>SDN:P01::SAGEMSFM</dce:identifier>
<dc:date>2008-10-16 16:27:06.0</dc:date>
<skos:notation>SDN:P01::SAGEMSFM</skos:notation>
<skos:prefLabel xml:lang="en">14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry</skos:prefLabel>
<skos:altLabel>AMSSedAge</skos:altLabel>
<skos:definition xml:lang="en">Accelerated mass spectrometry on picked tests</skos:definition>
<owl:versionInfo>1</owl:versionInfo>
<pav:hasCurrentVersion rdf:resource="http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/1/"/>
<pav:version>1</pav:version>
<pav:authoredOn>2008-10-16 16:27:06.0</pav:authoredOn>
<skos:note xml:lang="en">accepted</skos:note>
<owl:deprecated>false</owl:deprecated>
<iop:hasMatrix rdf:resource="http://vocab.nerc.ac.uk/collection/S21/current/S21S022/"/>
<skos:related rdf:resource="http://vocab.nerc.ac.uk/collection/P06/current/UYBP/"/>
<skos:broader rdf:resource="http://vocab.nerc.ac.uk/collection/S25/current/BE002325/"/>
<skos:broader rdf:resource="http://vocab.nerc.ac.uk/collection/S26/current/MAT00136/"/>
<void:inDataset rdf:resource="http://vocab.nerc.ac.uk/.well-known/void"/>
</skos:Concept>
</skos:member>
</skos:Collection>
</rdf:RDF>""",
encoding="utf-8",
)


@pytest.fixture(scope="module")
def expected_from_rdf():
    """Return the entries the transformer is expected to build from ``XML_DATA``."""
    concept_url = "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/"
    english_title = "14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry"

    entry = {
        "id": "SDN:P01::SAGEMSFM",
        "scheme": "BODC-PUV",
        "subject": "AMSSedAge",
        "title": {"en": english_title},
        "props": {"definition": "Accelerated mass spectrometry on picked tests"},
        "identifiers": [{"scheme": "url", "identifier": concept_url}],
    }
    return [entry]


def test_gemet_concept_transformer_pref_label(expected_from_rdf):
    """Transform every entry read from ``XML_DATA`` and compare with the fixture.

    NOTE(review): the name mentions GEMET, but the transformer under test is
    ``BODCPUVSubjectsTransformer`` -- consider renaming.
    """
    reader = RDFReader()
    graph = Graph()
    graph.parse(io.BytesIO(XML_DATA), format="xml")

    # The reader must yield at least one entry, otherwise the comparison
    # below would not exercise the transformer at all.
    raw_entries = list(reader._iter(graph))
    assert len(raw_entries) > 0

    transformer = BODCPUVSubjectsTransformer()
    transformed = [
        transformer.apply(StreamEntry(raw)).entry for raw in raw_entries
    ]
    assert expected_from_rdf == transformed

Check failure on line 90 in tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py

View workflow job for this annotation

GitHub Actions / Python / Tests (3.9, postgresql14, opensearch2)

test_gemet_concept_transformer_pref_label AssertionError: assert [{'id': 'SDN:...DC-PUV', ...}] == [{'id': 'SDN:...DC-PUV', ...}] At index 0 diff: {'id': 'SDN:P01::SAGEMSFM', 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', 'title': {'en': '14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry'}, 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'identifiers': [{'scheme': 'url', 'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/'}]} != {'id': 'SDN:P01::SAGEMSFM', 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', 'title': {'en': '14c age of foraminiferida (itis: 44030: worms 22528) [subcomponent: tests] in sediment by picking and accelerator mass spectrometry'}, 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'identifiers': [{'scheme': 'url', 'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/'}]} Full diff: [ {'id': 'SDN:P01::SAGEMSFM', 'identifiers': [{'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/', 'scheme': 'url'}], 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', - 'title': {'en': '14c age of foraminiferida (itis: 44030: worms 22528) ' ? ^ ^ ^^^^ ^ ^^^ + 'title': {'en': '14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) ' ? ^ ^ ^^^^ ^ ^^^ - '[subcomponent: tests] in sediment by picking and ' ? ^ + '[Subcomponent: tests] in sediment by picking and ' ? ^ 'accelerator mass spectrometry'}}, ]

Check failure on line 90 in tests/contrib/subjects/bodc/test_subjects_bodc_datastream.py

View workflow job for this annotation

GitHub Actions / Python / Tests (3.12, postgresql14, opensearch2)

test_gemet_concept_transformer_pref_label AssertionError: assert [{'id': 'SDN:...DC-PUV', ...}] == [{'id': 'SDN:...DC-PUV', ...}] At index 0 diff: {'id': 'SDN:P01::SAGEMSFM', 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', 'title': {'en': '14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) [Subcomponent: tests] in sediment by picking and accelerator mass spectrometry'}, 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'identifiers': [{'scheme': 'url', 'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/'}]} != {'id': 'SDN:P01::SAGEMSFM', 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', 'title': {'en': '14c age of foraminiferida (itis: 44030: worms 22528) [subcomponent: tests] in sediment by picking and accelerator mass spectrometry'}, 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'identifiers': [{'scheme': 'url', 'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/'}]} Full diff: [ {'id': 'SDN:P01::SAGEMSFM', 'identifiers': [{'identifier': 'http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM/', 'scheme': 'url'}], 'props': {'definition': 'Accelerated mass spectrometry on picked tests'}, 'scheme': 'BODC-PUV', 'subject': 'AMSSedAge', - 'title': {'en': '14c age of foraminiferida (itis: 44030: worms 22528) ' ? ^ ^ ^^^^ ^ ^^^ + 'title': {'en': '14C age of Foraminiferida (ITIS: 44030: WoRMS 22528) ' ? ^ ^ ^^^^ ^ ^^^ - '[subcomponent: tests] in sediment by picking and ' ? ^ + '[Subcomponent: tests] in sediment by picking and ' ? ^ 'accelerator mass spectrometry'}}, ]
36 changes: 18 additions & 18 deletions tests/contrib/subjects/gemet/test_subjects_gemet_datastream.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,24 @@
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
xml:base="http://www.eionet.europa.eu/gemet/">
<skos:Concept rdf:about="concept/10008">
<skos:inScheme rdf:resource="http://www.eionet.europa.eu/gemet/gemetThesaurus"/>
<skos:prefLabel xml:lang="en">Consumer product</skos:prefLabel>
<skos:prefLabel xml:lang="ar">منتج استهلاكي</skos:prefLabel>
<skos:broader rdf:resource="concept/6660"/>
<skos:memberOf rdf:resource="group/10112"/>
<skos:memberOf rdf:resource="theme/27"/>
<skos:memberOf rdf:resource="theme/34"/>
</skos:Concept>
<rdf:Description rdf:about="theme/27">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
<rdf:Description rdf:about="theme/34">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
<rdf:Description rdf:about="group/10112">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
<skos:Concept rdf:about="concept/10008">
<skos:inScheme rdf:resource="http://www.eionet.europa.eu/gemet/gemetThesaurus"/>
<skos:prefLabel xml:lang="en">Consumer product</skos:prefLabel>
<skos:prefLabel xml:lang="ar">منتج استهلاكي</skos:prefLabel>
<skos:broader rdf:resource="concept/6660"/>
<skos:memberOf rdf:resource="group/10112"/>
<skos:memberOf rdf:resource="theme/27"/>
<skos:memberOf rdf:resource="theme/34"/>
</skos:Concept>
<rdf:Description rdf:about="theme/27">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
<rdf:Description rdf:about="theme/34">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
<rdf:Description rdf:about="group/10112">
<skos:member rdf:resource="concept/10008"/>
</rdf:Description>
</rdf:RDF>""",
encoding="utf-8",
)
Expand Down

0 comments on commit f9e30ae

Please sign in to comment.