From 5433b9f80df3e2f0e908d060718f875b340b4519 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Champin Date: Tue, 24 Oct 2023 12:25:35 +0200 Subject: [PATCH] add a few examples for sophia --- Cargo.toml | 1 + jsonld/Cargo.toml | 2 +- sophia/Cargo.toml | 3 + sophia/examples/canonicalize.rs | 46 ++++++++++ sophia/examples/parse.rs | 150 ++++++++++++++++++++++++++++++++ sophia/examples/serialize.rs | 95 ++++++++++++++++++++ 6 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 sophia/examples/canonicalize.rs create mode 100644 sophia/examples/parse.rs create mode 100644 sophia/examples/serialize.rs diff --git a/Cargo.toml b/Cargo.toml index a6c54e8d..c586073f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ rio_xml = { version = "0.8" } test-case = "3.1.0" thiserror = "1.0.32" tokio = { version="1.33.0", features = ["rt"] } +url = "2.4.1" [profile.release] lto = true diff --git a/jsonld/Cargo.toml b/jsonld/Cargo.toml index 05a94d58..b4c1fb05 100644 --- a/jsonld/Cargo.toml +++ b/jsonld/Cargo.toml @@ -30,7 +30,7 @@ sophia_iri.workspace = true sophia_term.workspace = true thiserror.workspace = true tokio.workspace = true -url = "2.4.1" +url.workspace = true [dev-dependencies] sophia_turtle.workspace = true diff --git a/sophia/Cargo.toml b/sophia/Cargo.toml index 1149bb89..0b465495 100644 --- a/sophia/Cargo.toml +++ b/sophia/Cargo.toml @@ -36,3 +36,6 @@ sophia_rio.workspace = true sophia_turtle.workspace = true sophia_term.workspace = true sophia_xml = { workspace = true, optional = true } + +[dev-dependencies] +url.workspace = true \ No newline at end of file diff --git a/sophia/examples/canonicalize.rs b/sophia/examples/canonicalize.rs new file mode 100644 index 00000000..3b0bfb99 --- /dev/null +++ b/sophia/examples/canonicalize.rs @@ -0,0 +1,46 @@ +//! Read a Dataset serialized in [N-Quads] from the standard input, +//! and write back to the standard output its canonical form, +//! using the [RDFC-1.0] canonicalization algorithm. +//! +//! 
Parameters of the RDFC-1.0 can be provided via the following environment variables: +//! * SOPHIA_RDFC10_DEPTH_FACTOR +//! * SOPHIA_RDFC10_PERMUTATION_LIMIT +//! +//! [N-Quads]: https://www.w3.org/TR/n-quads/ +//! [RDFC-1.0]: https://www.w3.org/TR/rdf-canon/ + +use std::env::{var, VarError::*}; +use std::io::{stdin, stdout, BufReader, BufWriter}; + +use sophia::api::prelude::*; +use sophia::api::quad::Spog; +use sophia::api::term::SimpleTerm; +use sophia::c14n::rdfc10; +use sophia::turtle::parser::nq; +use sophia_c14n::hash::Sha256; +use sophia_c14n::rdfc10::{DEFAULT_DEPTH_FACTOR, DEFAULT_PERMUTATION_LIMIT}; + +fn main() -> Result<(), Box> { + let input = BufReader::new(stdin()); + let dataset: MyDataset = nq::parse_bufread(input).collect_quads()?; + let output = BufWriter::new(stdout()); + let depth_factor = match var("SOPHIA_RDFC10_DEPTH_FACTOR") { + Ok(txt) => txt + .parse() + .expect("SOPHIA_RDFC10_DEPTH_FACTOR is not a valid f32"), + Err(NotPresent) => DEFAULT_DEPTH_FACTOR, + Err(other) => return Err(other.into()), + }; + let permutation_limit = match var("SOPHIA_RDFC10_PERMUTATION_LIMIT") { + Ok(txt) => txt + .parse() + .expect("SOPHIA_RDFC10_PERMUTATION_LIMIT is not a valid usize"), + Err(NotPresent) => DEFAULT_PERMUTATION_LIMIT, + Err(other) => return Err(other.into()), + }; + // TODO make it possible to select another hash function + rdfc10::normalize_with::(&dataset, output, depth_factor, permutation_limit)?; + Ok(()) +} + +type MyDataset = std::collections::HashSet>>; diff --git a/sophia/examples/parse.rs b/sophia/examples/parse.rs new file mode 100644 index 00000000..70f7faaf --- /dev/null +++ b/sophia/examples/parse.rs @@ -0,0 +1,150 @@ +//! Parse a graph or a dataset from the standard input, +//! in the format specified in the first argument, +//! and write it back in [N-Triples]/[N-Quads] to the standard output. +//! +//! Alternatively, the input file name can be provided as a second argument, +//! 
which will also set the base IRI to the corresponding file: URL. +//! +//! The base IRI can be overridden via the environment variable SOPHIA_BASE. +//! +//! Recognized formats are: +//! - [`ntriples`](https://www.w3.org/TR/n-triples/) (alias `nt`) +//! - [`turtle`](https://www.w3.org/TR/turtle/) (alias `ttl`) +//! - [`nquads`](https://www.w3.org/TR/n-quads/) (alias `nq`) +//! - [`trig`](https://www.w3.org/TR/trig/) +//! - `gnq` (Generalized [N-Quads](https://www.w3.org/TR/n-quads/)) +//! - `gtrig` (Generalized [TriG](https://www.w3.org/TR/trig/), default) +//! - [`jsonld`](https://www.w3.org/TR/json-ld11) (if compiled with the `jsonld` feature) +//! - [`rdfxml`](https://www.w3.org/TR/rdf-syntax-grammar) (if compiled with the `xml` feature, alias `rdf`) +//! +//! [N-Triples]: https://www.w3.org/TR/n-triples/ +//! [N-Quads]: https://www.w3.org/TR/n-quads/ + +use std::fs::File; +use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdin}; + +use sophia::api::prelude::*; +use sophia::api::source::StreamError::{SinkError, SourceError}; +#[cfg(feature = "jsonld")] +use sophia::jsonld::{JsonLdOptions, JsonLdParser}; +use sophia::turtle::parser::{ + gnq::GNQuadsParser, gtrig::GTriGParser, nq::NQuadsParser, nt::NTriplesParser, trig::TriGParser, + turtle::TurtleParser, +}; +use sophia::turtle::serializer::{nq::NqSerializer, nt::NtSerializer}; +#[cfg(feature = "xml")] +use sophia::xml::parser::RdfXmlParser; + +fn main() { + let format = std::env::args() + .nth(1) + .unwrap_or_else(|| "gtrig".to_string()); + let path = std::env::args().nth(2); + let base = Some(if let Some(iri) = std::env::var_os("SOPHIA_BASE") { + let iri = iri + .into_string() + .expect("Invalid UTF-8 data in SOPHIA_BASE"); + Iri::new(iri).expect("Invalid IRI in SOPHIA_BASE") + } else if let Some(path) = &path { + let cwd = std::env::current_dir().expect("No current directory"); + let url = url::Url::from_file_path(cwd.join(path)).expect("Invalid path"); + Iri::new(url.into()).expect("Invalid 
file: IRI") + } else { + Iri::new_unchecked("x-stdin://localhost/".into()) + }); + let input = Input::new(path); + let res = match &format[..] { + "ntriples" | "nt" => dump_triples(input, NTriplesParser {}), + "turtle" | "ttl" => dump_triples(input, TurtleParser { base }), + "nquads" | "nq" => dump_quads(input, NQuadsParser {}), + "trig" => dump_quads(input, TriGParser { base }), + "gnq" => dump_quads(input, GNQuadsParser {}), + "gtrig" => dump_quads(input, GTriGParser { base }), + #[cfg(feature = "jsonld")] + "json-ld" | "jsonld" => { + let options = JsonLdOptions::new() + .with_base(base.clone().unwrap().map_unchecked(std::sync::Arc::from)); + let loader: sophia::jsonld::loader::FileUrlLoader = Default::default(); + #[cfg(feature = "http_client")] + let loader = sophia::jsonld::loader::ChainLoader::new( + loader, + sophia::jsonld::loader::HttpLoader::default(), + ); + let options = options.with_document_loader(loader); + dump_quads(input, JsonLdParser::new_with_options(options)) + } + #[cfg(feature = "xml")] + "rdfxml" | "rdf" => dump_triples(input, RdfXmlParser { base }), + _ => { + eprintln!("Unrecognized format: {}", format); + std::process::exit(-1); + } + }; + if let Err(msg) = res { + eprintln!("{}", msg); + std::process::exit(1); + } +} + +fn dump_triples>(input: Input, p: P) -> Result<(), String> { + let triple_source = p.parse(input); + + let output = BufWriter::new(stdout()); + let mut ser = NtSerializer::new(output); + match ser.serialize_triples(triple_source) { + Ok(_) => Ok(()), + Err(SourceError(e)) => Err(format!("Error while parsing input: {}", e)), + Err(SinkError(e)) => Err(format!("Error while writing quads: {}", e)), + } +} + +fn dump_quads>(input: Input, p: P) -> Result<(), String> { + let quad_source = p.parse(input); + + let output = BufWriter::new(stdout()); + let mut ser = NqSerializer::new(output); + match ser.serialize_quads(quad_source) { + Ok(_) => Ok(()), + Err(SourceError(e)) => Err(format!("Error while parsing input: {}", e)), + 
Err(SinkError(e)) => Err(format!("Error while writing quads: {}", e)), + } +} + +enum Input { + Stdin(BufReader), + File(BufReader), +} + +impl Input { + fn new(path: Option) -> Self { + match path { + None => Self::Stdin(BufReader::new(stdin())), + Some(path) => Self::File(BufReader::new(File::open(path).expect("Can not open file"))), + } + } +} + +impl Read for Input { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + match self { + Input::Stdin(b) => b.read(buf), + Input::File(b) => b.read(buf), + } + } +} + +impl BufRead for Input { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + match self { + Input::Stdin(b) => b.fill_buf(), + Input::File(b) => b.fill_buf(), + } + } + + fn consume(&mut self, amt: usize) { + match self { + Input::Stdin(b) => b.consume(amt), + Input::File(b) => b.consume(amt), + } + } +} diff --git a/sophia/examples/serialize.rs b/sophia/examples/serialize.rs new file mode 100644 index 00000000..81497678 --- /dev/null +++ b/sophia/examples/serialize.rs @@ -0,0 +1,95 @@ +//! Read a graph or a dataset from the standard input in [N-Triples]/[N-Quads], +//! and serialize it back to the format specified in the first argument. +//! +//! Recognized formats are: +//! - [`ntriples`](https://www.w3.org/TR/n-triples/) (alias `nt`) +//! - [`turtle`](https://www.w3.org/TR/turtle/) (alias `ttl`) +//! - [`nquads`](https://www.w3.org/TR/n-quads/) (alias `nq`) +//! - [`trig`](https://www.w3.org/TR/trig/) +//! - [`jsonld`](https://www.w3.org/TR/json-ld11) (if compiled with the `jsonld` feature) +//! - [`rdfxml`](https://www.w3.org/TR/rdf-syntax-grammar) (if compiled with the `xml` feature, alias `rdf`) +//! +//! NB: if the input is a dataset with named graphs, +//! and the output format is a graph format, +//! then only the default graph is serialized. +//! +//! [N-Triples]: https://www.w3.org/TR/n-triples/ +//! 
[N-Quads]: https://www.w3.org/TR/n-quads/ + +use std::io::{stdin, stdout, BufReader, BufWriter}; + +use sophia::api::prelude::*; +use sophia::api::source::StreamError::{SinkError, SourceError}; +#[cfg(feature = "jsonld")] +use sophia::jsonld::{serializer::JsonLdSerializer, JsonLdOptions}; +use sophia::turtle::parser::gnq; +use sophia::turtle::serializer::{ + nq::NqSerializer, + nt::NtSerializer, + trig::{TrigConfig, TrigSerializer}, + turtle::{TurtleConfig, TurtleSerializer}, +}; +#[cfg(feature = "xml")] +use sophia::xml::serializer::RdfXmlSerializer; + +fn main() { + let input = BufReader::new(stdin()); + let quad_source = gnq::parse_bufread(input); + let out = BufWriter::new(stdout()); + + let format = std::env::args() + .nth(1) + .unwrap_or_else(|| "trig".to_string()); + let res = match &format[..] { + "ntriples" | "nt" => serialize_triples(quad_source, NtSerializer::new(out)), + "turtle" | "ttl" => { + let config = TurtleConfig::new().with_pretty(true); + let ser = TurtleSerializer::new_with_config(out, config); + serialize_triples(quad_source, ser) + } + "nquads" | "nq" => serialize_quads(quad_source, NqSerializer::new(out)), + "trig" => { + let config = TrigConfig::new().with_pretty(true); + let ser = TrigSerializer::new_with_config(out, config); + serialize_quads(quad_source, ser) + } + #[cfg(feature = "jsonld")] + "json-ld" | "jsonld" => serialize_quads( + quad_source, + JsonLdSerializer::new_with_options(out, JsonLdOptions::new().with_spaces(2)), + ), + #[cfg(feature = "xml")] + "rdfxml" | "rdf" => serialize_triples(quad_source, RdfXmlSerializer::new(out)), + _ => { + eprintln!("Unrecognized format: {}", format); + std::process::exit(-1); + } + }; + if let Err(msg) = res { + eprintln!("{}", msg); + std::process::exit(1); + } +} + +fn serialize_triples( + quad_source: Q, + mut ser: S, +) -> Result<(), String> { + let triple_source = quad_source.filter_quads(|q| q.g().is_none()).to_triples(); + match ser.serialize_triples(triple_source) { + Ok(_) => Ok(()), 
+ Err(SourceError(e)) => Err(format!("Error while parsing input: {}", e)), + Err(SinkError(e)) => Err(format!("Error while serializing triples: {}", e)), + } +} + +fn serialize_quads( + quad_source: Q, + mut ser: S, +) -> Result<(), String> { + match ser.serialize_quads(quad_source) { + Ok(_) => Ok(()), + Err(SourceError(e)) => Err(format!("Error while parsing input: {}", e)), + Err(SinkError(e)) => Err(format!("Error while serializing quads: {}", e)), + } +}