Skip to content

Commit

Permalink
added Sweet Ontology repository (with 200+ sub ontologies)
Browse files Browse the repository at this point in the history
  • Loading branch information
k00ni committed May 20, 2024
1 parent a21892c commit 24266ad
Show file tree
Hide file tree
Showing 7 changed files with 668 additions and 8 deletions.
16 changes: 13 additions & 3 deletions doc/information-on-each-data-source.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ https://archivo.dbpedia.org/list
Related script: [scripts/src/Extractor/DBpediaArchivo.php](./../scripts/src/Extractor/DBpediaArchivo.php)

**Notes:**
* Used value of "Latest Timestamp" for latest access, "2020.06.10-175249" is interpreted as "2020-06-10 00:00:00"
* Used value of "Latest Timestamp" for `modified` field, "2020.06.10-175249" is interpreted as "2020-06-10 00:00:00"

## Linked Open Vocabularies (LOV)

Expand All @@ -30,7 +30,7 @@ https://lov.linkeddata.es/dataset/lov/
Related script: [scripts/src/Extractor/LinkedOpenVocabularies.php](./../scripts/src/Extractor/LinkedOpenVocabularies.php)

**Notes:**
* Used value of `dct:modified` for latest access; because the field only contains the date, the time is always set to `00:00:00`.
* Used value of `dct:modified` for `modified` field; because the field only contains the date, the time is always set to `00:00:00`.

## Ontology Lookup Service (OLS)

Expand All @@ -40,4 +40,14 @@ Related script: [scripts/src/Extractor/OntologyLookupService.php](./../scripts/s

**Notes:**
* Warning: Currently ignoring all ontologies with no `fileLocation` field set in ontology configuration
* Field `ontology.uploaded` is used for latest access
* Field `ontology.uploaded` is used for `modified` field

## Sweet Ontology GitHub Repository

https://github.com/ESIPFed/sweet

Related script: [scripts/src/Extractor/SweetOntologies.php](./../scripts/src/Extractor/SweetOntologies.php)

**Notes:**
* This extractor downloads the GitHub repository as a ZIP archive, unzips it and processes the Turtle files (`.ttl`) in its `src` folder
* Uses the latest change date of each file for the `modified` field
226 changes: 225 additions & 1 deletion index.csv

Large diffs are not rendered by default.

224 changes: 224 additions & 0 deletions manually-maintained-metadata-about-ontologies.csv

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions scripts/bin/renew_index.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use App\Extractor\DBpediaArchivo;
use App\Extractor\LinkedOpenVocabularies;
use App\Extractor\OntologyLookupService;
use App\Extractor\SweetOntologies;
use App\TemporaryIndex;
use quickRdf\DataFactory;

Expand All @@ -24,6 +25,7 @@
LinkedOpenVocabularies::class,
OntologyLookupService::class,
BioPortal::class,
SweetOntologies::class,
] as $class) {
/** @var \App\Extractor\AbstractExtractor */
$extractor = new $class($cache, $dataFactory, $temporaryIndex);
Expand Down
174 changes: 174 additions & 0 deletions scripts/src/Extractor/SweetOntologies.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
<?php

declare(strict_types=1);

namespace App\Extractor;

use App\IndexEntry;
use Exception;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;
use ZipArchive;

/**
* Semantic Web for Earth and Environmental Terminology (SWEET) Ontologies
*
* Reads ontology information from the GitHub repository:
*
* https://github.com/ESIPFed/sweet
*/
class SweetOntologies extends AbstractExtractor
{
    /**
     * Name of the top-level folder inside the downloaded ZIP archive.
     */
    private string $nameOfUnzippedFolder = 'sweet-master';

    protected string $namespace = 'extractor_sweet_ontologies';

    /**
     * Folder path to ontology list.
     */
    protected string $unzippedContent = VAR_FOLDER_PATH.'sweet-master'.DIRECTORY_SEPARATOR.'src'.DIRECTORY_SEPARATOR;

    /**
     * URL of the ZIP archive containing the current state of the master branch.
     */
    private string $ontologyListUrl = 'https://github.com/ESIPFed/sweet/archive/refs/heads/master.zip';

    /**
     * Downloads the repository archive, unzips it and stores one index entry
     * for each Turtle file of the src folder which describes exactly one ontology.
     *
     * @throws \Exception
     * @throws \UnexpectedValueException
     */
    public function run(): void
    {
        echo PHP_EOL;
        echo '-------------------------------------------------';
        echo PHP_EOL;
        echo 'Sweet Ontologies - Extraction started ...';
        echo PHP_EOL;

        /*
         * Approach:
         *
         * 1. download latest state of the repository
         * 2. unzip
         * 3. read content of src folder (contains all ontologies)
         * 4. fill index
         */

        // 1. download latest state of the repository
        $downloadedFilepath = $this->cache->getCachedFilePathForFileUrl($this->ontologyListUrl);

        // if available, remove folder with repository content of a previous run
        $repositoryFolder = VAR_FOLDER_PATH.$this->nameOfUnzippedFolder;
        if (is_dir($repositoryFolder)) {
            $this->removeFolderRec($repositoryFolder);
        }

        // 2. unzip
        $this->unzip($downloadedFilepath, VAR_FOLDER_PATH);

        // 3. read content of src folder to get a list of all ontology files
        $path = $repositoryFolder.DIRECTORY_SEPARATOR.'src';
        $ttlFiles = $this->getTtlFileListOfFolder($path);

        sort($ttlFiles);

        foreach ($ttlFiles as $ttlFilepath) {
            echo PHP_EOL;
            echo PHP_EOL.'process: '.$ttlFilepath;
            echo PHP_EOL;

            $this->processTtlFile($ttlFilepath, $path);
        }
    }

    public function getPreparedIndexEntry(): IndexEntry
    {
        return new IndexEntry('Sweet Ontologies', $this->ontologyListUrl);
    }

    /**
     * Reads one Turtle file and, if it contains exactly one owl:Ontology instance,
     * stores a related entry in the temporary index. Files with none or more than
     * one ontology are reported on standard output and ignored.
     *
     * @param string $ttlFilepath local path of the Turtle file
     * @param string $srcFolder   local path of the src folder (without trailing separator)
     *
     * @throws \Exception if the file could not be opened
     */
    private function processTtlFile(string $ttlFilepath, string $srcFolder): void
    {
        $fileHandle = fopen($ttlFilepath, 'r');
        if (false === $fileHandle) {
            throw new Exception('Could not read file '.$ttlFilepath);
        }

        // the handle stays open until the graph is no longer needed and is released
        // on every exit path (ignored file, exception, success)
        try {
            $newEntry = $this->getPreparedIndexEntry();
            $graph = $this->loadQuadsIntoGraph($fileHandle, $ttlFilepath);

            // get ontology IRI
            $entries = $graph->getInstancesOfType('owl:Ontology');
            if (1 !== count($entries)) {
                echo PHP_EOL.'IGNORED, because none or more than 1 ontologies found in '.$ttlFilepath;

                return;
            }

            $newEntry->setOntologyIri($entries[0]);
            $this->addFurtherMetadata($newEntry, $graph);

            // set title
            $newEntry->setOntologyTitle($graph->getLabel((string) $newEntry->getOntologyIri()));

            // build TTL file URL and save it; the path relative to the src folder is used
            // (not only the basename), because the file list is collected recursively
            $relativePath = ltrim(substr($ttlFilepath, strlen($srcFolder)), '/\\');
            $relativePath = str_replace(DIRECTORY_SEPARATOR, '/', $relativePath);
            $newEntry->setLatestTurtleFile('https://raw.githubusercontent.com/ESIPFed/sweet/master/src/'.$relativePath);

            // set modified date
            // NOTE(review): this is the mtime of the unzipped file; confirm that the unzip step
            // preserves the timestamps of the archive, otherwise this is the time of extraction
            $lastChangeTimestamp = filemtime($ttlFilepath);
            if (false !== $lastChangeTimestamp && 0 < $lastChangeTimestamp) {
                $newEntry->setModified(date('Y-m-d', $lastChangeTimestamp));
            }

            // 4. fill index
            $this->temporaryIndex->storeEntries([$newEntry]);
        } finally {
            // is_resource() is false for an already closed handle, which avoids a second fclose()
            if (is_resource($fileHandle)) {
                fclose($fileHandle);
            }
        }
    }

    /**
     * Collects the paths of all files with extension "ttl" in the given folder and its subfolders.
     *
     * @return array<non-empty-string>
     *
     * @throws \UnexpectedValueException
     */
    private function getTtlFileListOfFolder(string $folderpath): array
    {
        $rii = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($folderpath, RecursiveDirectoryIterator::SKIP_DOTS)
        );
        $files = [];

        /** @var \SplFileInfo $file */
        foreach ($rii as $file) {
            if ($file->isDir()) {
                continue;
            }

            // check the extension instead of searching the whole path for ".ttl", which would
            // also match files like "foo.ttl.bak" or any file below a folder named "x.ttl"
            if ('ttl' === $file->getExtension()) {
                $files[] = $file->getPathname();
            }
        }

        return $files;
    }

    /**
     * Extracts the whole content of a ZIP archive into the target folder.
     *
     * @param non-empty-string $fileToExtract
     * @param non-empty-string $targetPath
     *
     * @throws \Exception if the archive could not be opened or extracted
     */
    private function unzip(string $fileToExtract, string $targetPath): void
    {
        $zip = new ZipArchive();
        if (true !== $zip->open($fileToExtract)) {
            throw new Exception('Unzip of '.$fileToExtract.' failed!');
        }

        try {
            // extractTo returns false on failure, which must not go unnoticed
            if (false === $zip->extractTo($targetPath)) {
                throw new Exception('Unzip of '.$fileToExtract.' failed! Could not extract content to '.$targetPath);
            }
        } finally {
            $zip->close();
        }
    }

    /**
     * Removes a folder and everything inside of it.
     *
     * @throws \UnexpectedValueException
     */
    private function removeFolderRec(string $folderpath): void
    {
        $files = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($folderpath, RecursiveDirectoryIterator::SKIP_DOTS),
            RecursiveIteratorIterator::CHILD_FIRST
        );

        /** @var \SplFileInfo $fileinfo */
        foreach ($files as $fileinfo) {
            // use the path of the entry itself: getRealPath() resolves symbolic links
            // (the link target would be removed instead of the link) and may return false
            $pathname = $fileinfo->getPathname();

            // isDir() follows symbolic links, but a link has to be removed with unlink
            if ($fileinfo->isDir() && false === $fileinfo->isLink()) {
                rmdir($pathname);
            } else {
                unlink($pathname);
            }
        }

        rmdir($folderpath);
    }
}
25 changes: 25 additions & 0 deletions scripts/src/Graph.php
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,31 @@ public function hasSubject(string $uri): bool
return false;
}

/**
 * Collects the subjects of all quads which state that the subject is of the given type,
 * i.e. whose predicate is rdf:type and whose object equals the expanded type URI.
 *
 * Each subject is listed only once (in order of first occurrence), even if the
 * same statement appears multiple times in the list of quads.
 *
 * @param string $type type URI, is passed through RdfNamespace::expand (e.g. 'owl:Ontology')
 *
 * @throws \InvalidArgumentException
 *
 * @return array<non-empty-string>
 */
public function getInstancesOfType(string $type): array
{
    $fullTypeUri = RdfNamespace::expand($type);

    $result = [];

    foreach ($this->list as $quad) {
        if (
            'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' === $quad->getPredicate()->getValue()
            && $fullTypeUri === $quad->getObject()->getValue()
        ) {
            /** @var non-empty-string */
            $val = $quad->getSubject()->getValue();

            // a repeated type statement must not be reported as a further instance
            if (false === in_array($val, $result, true)) {
                $result[] = $val;
            }
        }
    }

    return $result;
}

/**
* @throws \InvalidArgumentException
*/
Expand Down
9 changes: 5 additions & 4 deletions scripts/var/.gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
downloaded_rdf_files/*
temp_files/*
extractor_*
phpstan_cache
/downloaded_rdf_files/*
/extractor_*
/phpstan_cache
/sweet-master
/temp_files/*
*.*
!.gitignore

0 comments on commit 24266ad

Please sign in to comment.