Merge branch 'develop' into feature/ontology-link

dice-group · Sep 20, 2024 · f664e17 · f664e17
2 parents bcdc7b4 + 033ef0b
commit f664e17
Show file tree

Hide file tree

Showing 48 changed files with 1,980 additions and 394 deletions.
diff --git a/docs/configuration/queries.md b/docs/configuration/queries.md
@@ -16,6 +16,7 @@ The `queries` property is an object that contains the following properties:
 | order     | no       | `linear`       | The order in which the queries are executed. If set to `linear` the queries will be executed in their order inside the file. If `format` is set to `folder`, queries will be sorted by their file name first. | `random` or `linear`                      |
 | seed      | no       | `0`            | The seed for the random number generator that selects the queries. If multiple workers use the same query handler, their seed will be the sum of the given seed and their worker id.                          | `12345`                                   |
 | lang      | no       | `SPARQL`       | Not used for anything at the moment.                                                                                                                                                                          |                                           |
+| template  | no       |                | If set, queries from `path` will be treated as query templates. See [Query Templates](#query-templates) for more information.                                                                                 |                                           |
 
 ## Format
 
@@ -77,6 +78,11 @@ WHERE {
 ```
 The `separator` property should be set to `"\n###\n"`. (be aware of different line endings on different operating systems)
 
+## Huge Query Strings
+When working with huge queries (queries larger than 2³¹ bytes, i.e. ~2 GB),
+keep in mind that only the request types `post query` and `update query`
+support queries of that size.
+
 ## Example
 ```yaml
 tasks:
@@ -93,3 +99,54 @@ tasks:
         lang: "SPARQL"
       # ... additional worker properties
 ```
+
+## Query Templates
+Query templates are queries containing placeholders for some terms. 
+Replacement candidates are identified by querying a given endpoint. 
+This is done in a way that the resulting queries will yield results against endpoints with the same data.
+
+The placeholders are written in the form of `%%[a-zA-Z0-9_]+%%`, which means that any character sequence consisting 
+of letters, numbers, and underscores, enclosed by `%%` will be interpreted as a placeholder.
+The query templates originated from WatDiv, 
+where the placeholders are of [similar form](https://dsg.uwaterloo.ca/watdiv/basic-testing.shtml).
+If a placeholder name is identical to a variable name that already occurs in the query, the placeholder
+will be assigned a different variable name during candidate generation.
+
+Query templates and normal queries can be mixed in the same file or folder.
+
+An exemplary template:
+`SELECT * WHERE {?s %%var1%% ?o . ?o <http://exa.com> %%var2%%}`
+
+This template will then be converted to:
+`SELECT ?var1 ?var2 WHERE {?s ?var1 ?o . ?o <http://exa.com> ?var2}`
+
+The SELECT query will then be sent to the given SPARQL endpoint (e.g., DBpedia).
+The solutions for this query are used to instantiate the template.
+The results may look like the following:
+- `SELECT * WHERE {?s <http://prop/1> ?o . ?o <http://exa.com> "123"}`
+- `SELECT * WHERE {?s <http://prop/1> ?o . ?o <http://exa.com> "12"}`
+- `SELECT * WHERE {?s <http://prop/2> ?o . ?o <http://exa.com> "1234"}`
+
+### Configuration
+The `template` attribute has the following properties:
+
+| property | required | default | description                                                         | example                     |
+|----------|----------|---------|---------------------------------------------------------------------|-----------------------------|
+| endpoint | yes      |         | The endpoint to query.                                              | `http://dbpedia.org/sparql` |
+| limit    | no       | `2000`  | The maximum number of instances per query template.                 | `100`                       |
+| save     | no       | `true`  | If set to `true`, query instances will be saved in a separate file. | `false`                     |
+
+If the `save` attribute is set to `true`,
+the instances will be saved in a separate file in the same directory as the query templates.
+If the query templates are stored in a folder, the instances will be saved in the parent directory.
+
+Example of query configuration with query templates:
+```yaml
+queries:
+  path: "./example/suite/queries/"
+  format: "folder" 
+  template:
+    endpoint: "http://dbpedia.org/sparql"
+    limit: 100
+    save: true
+```
diff --git a/docs/configuration/response_body_processor.md b/docs/configuration/response_body_processor.md
@@ -6,9 +6,20 @@ The processing is done to extract relevant information from the responses and st
 
 Iguana supports multiple response body processors that are defined by the content type of the response body they process.
 
-Currently only the `application/sparql-results+json` content type is supported, 
-and it only uses the `SaxSparqlJsonResultCountingParser` language processor 
-to extract simple information from the responses.
+The following content types are supported:
+- `application/sparql-results+json`
+- `application/sparql-results+xml`
+- `text/csv`
+- `text/tab-separated-values`
+
+
+For the `json` and `xml` content types and `SELECT` queries,
+the response body processor counts the number of results and bindings
+and lists all variables and link attributes.
+If the requested query was an `ASK` query, the response body processor stores the boolean result.
+
+For the `csv` and `tsv` content types, only `SELECT` queries are supported.
+The response body processor counts the number of results and bindings and lists all variables.
 
 Workers send the response bodies to the response body processors, 
 after receiving the full response bodies from the HTTP requests.

diff --git a/example-suite.yml b/example-suite.yml
@@ -74,7 +74,11 @@ tasks:
         number: 16
         requestType: post query
         queries:
-          path: "./example/queries.txt"
+          path: "./example/query_pattern.txt"
+          template:
+            endpoint: "https://dbpedia.org/sparql"
+            limit: 1000
+            save: false
         timeout: 180s
         completionTarget:
           duration: 1000s

diff --git a/graalvm/generate-profile.sh b/graalvm/generate-profile.sh
@@ -40,21 +40,21 @@ if [ -z "$SUITE" ]; then
 fi
 
 # Instrument the application
-"$GRAALVM_HOME"/bin/native-image --pgo-instrument "$ARGUMENTS" -jar ./target/iguana.jar -o "./target/iguana-4.0.0-instrumented"
+"$GRAALVM_HOME"/bin/native-image --pgo-instrument "$ARGUMENTS" -jar ./target/iguana.jar -o "./target/iguana-4.1.0-instrumented"
 if [ $? -ne 0 ]; then
   echo "Error while instrumenting the application."
   exit 1
 fi
 
 # Generate the profile
-./target/iguana-4.0.0-instrumented -XX:ProfilesDumpFile=custom.iprof "$SUITE"
+./target/iguana-4.1.0-instrumented -XX:ProfilesDumpFile=custom.iprof "$SUITE"
 if [ $? -ne 0 ]; then
   echo "Error while generating the profile."
   exit 1
 fi
 
 # Compile the application with the profile
-"$GRAALVM_HOME"/bin/native-image --pgo=custom.iprof "$ARGUMENTS" -jar ./target/iguana.jar -o "./target/iguana-4.0.0-pgo"
+"$GRAALVM_HOME"/bin/native-image --pgo=custom.iprof "$ARGUMENTS" -jar ./target/iguana.jar -o "./target/iguana-4.1.0-pgo"
 if [ $? -ne 0 ]; then
   echo "Error while compiling the application."
   exit 1

diff --git a/graalvm/queries.txt b/graalvm/queries.txt
@@ -1 +1 @@
-placeholder
+SELECT * WHERE {?s %%var1%% ?o . ?o %%var3%% %%var2%%}
diff --git a/graalvm/suite.yml b/graalvm/suite.yml
@@ -59,6 +59,10 @@ tasks:
           order: "random"
           seed: 123
           lang: "SPARQL"
+          template:
+            endpoint: "http://dbpedia.org/sparql"
+            limit: 1
+            save: false
         timeout: 2s
         connection: Blazegraph
         completionTarget:

diff --git a/pom.xml b/pom.xml
@@ -37,7 +37,7 @@
         <revision>${major.minor.version}.${build.version}</revision>
         <major.minor.version>${major.version}.${minor.version}</major.minor.version>
         <major.version>4</major.version>
-        <minor.version>0</minor.version>
+        <minor.version>1</minor.version>
         <build.version>0</build.version>
 
         <java.version>17</java.version>
@@ -315,6 +315,8 @@
                                 -O3
                                 -H:-UseCompressedReferences
                                 -H:+UnlockExperimentalVMOptions
+                                --enable-http
+                                --enable-https
                             </buildArgs>
                             <metadataRepository>
                                 <enabled>true</enabled>

diff --git a/schema/iguana-schema.json b/schema/iguana-schema.json
@@ -183,8 +183,8 @@
         }
       },
       "required": [
-          "type",
-          "directory"
+        "type",
+        "directory"
       ],
       "title": "CSVStorage"
     },
@@ -335,9 +335,29 @@
       "type": "object",
       "unevaluatedProperties": false,
       "required": [
-          "duration"
+        "duration"
       ]
     },
+    "Template": {
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "endpoint": {
+          "type": "string"
+        },
+        "limit": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "save": {
+          "type": "boolean"
+        }
+      },
+      "required": [
+        "endpoint"
+      ],
+      "title": "Template"
+    },
     "QueryMixes": {
       "properties": {
         "number": {
@@ -379,6 +399,9 @@
         "lang": {
           "type": "string",
           "enum": [ "", "SPARQL" ]
+        },
+        "template": {
+          "$ref": "#/definitions/Template"
         }
       },
       "required": [

diff --git a/schema/iguana.owx b/schema/iguana.owx
@@ -5,8 +5,8 @@
      xmlns:xml="http://www.w3.org/XML/1998/namespace"
      xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
      xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
-     ontologyIRI="https://vocab.dice-research.org/iguana/"
-     versionIRI="https://vocab.dice-research.org/iguana/4.0.0/">
+     ontologyIRI="https://iguana-benchmark.eu/ontology/"
+     versionIRI="https://iguana-benchmark.eu/ontology/4.1.0/">
     <Prefix name="dc" IRI="http://purl.org/dc/elements/1.1/"/>
     <Prefix name="owl" IRI="http://www.w3.org/2002/07/owl#"/>
     <Prefix name="rdf" IRI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
@@ -48,7 +48,7 @@
     </Annotation>
     <Annotation>
         <AnnotationProperty abbreviatedIRI="owl:versionInfo"/>
-        <Literal>4.0.0</Literal>
+        <Literal>4.1.0</Literal>
     </Annotation>
     <Declaration>
         <Class abbreviatedIRI="iont:Connection"/>

diff --git a/src/main/java/org/aksw/iguana/cc/lang/LanguageProcessor.java b/src/main/java/org/aksw/iguana/cc/lang/LanguageProcessor.java
@@ -33,7 +33,7 @@ public abstract class LanguageProcessor {
 
     public interface LanguageProcessingData extends Storable {
         long hash();
-        Class<? extends LanguageProcessor> processor();
+        Exception exception();
     }
 
     public abstract LanguageProcessingData process(InputStream inputStream, long hash);
@@ -45,6 +45,9 @@ public interface LanguageProcessingData extends Storable {
     // Register all available LanguageProcessors here.
     static {
         processors.put("application/sparql-results+json", org.aksw.iguana.cc.lang.impl.SaxSparqlJsonResultCountingParser.class);
+        processors.put("application/sparql-results+xml", org.aksw.iguana.cc.lang.impl.SaxSparqlXmlResultCountingParser.class);
+        processors.put("text/tab-separated-values", org.aksw.iguana.cc.lang.impl.SparqlTsvResultCountingParser.class);
+        processors.put("text/csv", org.aksw.iguana.cc.lang.impl.SparqlCsvResultCountingParser.class);
     }
 
     public static LanguageProcessor getInstance(String contentType) {

diff --git a/src/main/java/org/aksw/iguana/cc/lang/impl/BooleanResultData.java b/src/main/java/org/aksw/iguana/cc/lang/impl/BooleanResultData.java
@@ -0,0 +1,60 @@
+package org.aksw.iguana.cc.lang.impl;
+
+import org.aksw.iguana.cc.lang.LanguageProcessor;
+import org.aksw.iguana.cc.storage.Storable;
+import org.aksw.iguana.commons.rdf.IPROP;
+import org.aksw.iguana.commons.rdf.IRES;
+import org.apache.jena.rdf.model.Model;
+import org.apache.jena.rdf.model.ModelFactory;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.ResourceFactory;
+
+import java.util.List;
+
+public record BooleanResultData( // Processing outcome for a response body that carries a single boolean (ASK) result; exportable as CSV and RDF.
+        long hash, // response-body hash: CSV column "responseBodyHash" and the key passed to IRES.getResponsebodyResource in toRDF()
+        Boolean result, // boxed so it can be null; null becomes an empty CSV cell and is omitted from the RDF model
+        List<String> links, // may be null; joined with "; " for CSV, one IPROP.link statement per entry for RDF
+        Exception exception // may be null; only its toString() is stored
+        ) implements LanguageProcessor.LanguageProcessingData, Storable.AsCSV, Storable.AsRDF {
+    final static String[] header = new String[]{ "responseBodyHash", "boolean", "links", "exception" }; // CSV column names; order must match `content` in toCSV()
+
+    @Override
+    public Storable.CSVData toCSV() { // Returns one CSV file ("sparql-ask-result.csv") holding the header row plus a single data row.
+        String resultString = ""; // every nullable component defaults to an empty cell
+        String exceptionString = "";
+        String linksString = "";
+        if (result != null)
+            resultString = result.toString();
+        if (exception != null)
+            exceptionString = exception().toString();
+        if (links != null)
+            linksString = String.join("; ", links);
+
+        String[] content = new String[]{ String.valueOf(hash), resultString, linksString, exceptionString }; // same column order as `header`
+        String[][] data = new String[][]{ header, content };
+
+        String folderName = "sparql-ask-result-data";
+        List<CSVData.CSVFileData> files = List.of(new Storable.CSVData.CSVFileData("sparql-ask-result.csv", data));
+        return new Storable.CSVData(folderName, files);
+    }
+
+    @Override
+    public Model toRDF() { // Returns a fresh model describing this result; null components contribute no statements.
+        Model m = ModelFactory.createDefaultModel();
+        Resource responseBodyRes = IRES.getResponsebodyResource(this.hash); // subject of every statement added below
+        if (this.result != null) {
+            m.add(responseBodyRes, IPROP.askBoolean, ResourceFactory.createTypedLiteral(this.result));
+        }
+        if (this.links != null) {
+            for (String link : this.links) {
+                m.add(responseBodyRes, IPROP.link, ResourceFactory.createTypedLiteral(link));
+            }
+        }
+        if (this.exception != null) {
+            m.add(responseBodyRes, IPROP.exception, ResourceFactory.createTypedLiteral(this.exception.toString())); // stored as its string form, not as a structured value
+        }
+
+        return m;
+    }
+}
diff --git a/src/main/java/org/aksw/iguana/cc/lang/impl/ResultCountData.java b/src/main/java/org/aksw/iguana/cc/lang/impl/ResultCountData.java
@@ -0,0 +1,67 @@
+package org.aksw.iguana.cc.lang.impl;
+
+import org.aksw.iguana.cc.lang.LanguageProcessor;
+import org.aksw.iguana.cc.storage.Storable;
+import org.aksw.iguana.commons.rdf.IPROP;
+import org.aksw.iguana.commons.rdf.IRES;
+import org.apache.jena.rdf.model.Model;
+import org.apache.jena.rdf.model.ModelFactory;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.ResourceFactory;
+
+import java.util.List;
+
+public record ResultCountData ( // Counting summary of one processed response body (results, bindings, variables, links); exportable as CSV and RDF.
+    long hash, // response-body hash: CSV column "responseBodyHash" and the key passed to IRES.getResponsebodyResource in toRDF()
+    long results, // result count; always written, both to CSV and RDF
+    long bindings, // binding count; always written, both to CSV and RDF
+    List<String> variables, // may be null; joined with "; " for CSV, one IPROP.variable statement per entry for RDF
+    List<String> links, // may be null; joined with "; " for CSV, one IPROP.link statement per entry for RDF
+    Exception exception // may be null; only its toString() is stored
+) implements LanguageProcessor.LanguageProcessingData, Storable.AsCSV, Storable.AsRDF {
+    final static String[] header = new String[]{ "responseBodyHash", "results", "bindings", "variables", "links", "exception" }; // CSV column names; order must match `content` in toCSV()
+
+    @Override
+    public Storable.CSVData toCSV() { // Returns one CSV file ("result-count.csv") holding the header row plus a single data row.
+        String variablesString = ""; // every nullable component defaults to an empty cell
+        String exceptionString = "";
+        String linksString = "";
+        if (variables != null)
+            variablesString = String.join("; ", variables);
+        if (exception != null)
+            exceptionString = exception().toString();
+        if (links != null)
+            linksString = String.join("; ", links);
+
+        String[] content = new String[]{ String.valueOf(hash), String.valueOf(results), String.valueOf(bindings), variablesString, linksString, exceptionString }; // same column order as `header`
+        String[][] data = new String[][]{ header, content };
+
+        String folderName = "result-count-data";
+        List<Storable.CSVData.CSVFileData> files = List.of(new Storable.CSVData.CSVFileData("result-count.csv", data));
+        return new Storable.CSVData(folderName, files);
+    }
+
+    @Override
+    public Model toRDF() { // Returns a fresh model describing this result; null components contribute no statements.
+        Model m = ModelFactory.createDefaultModel();
+        Resource responseBodyRes = IRES.getResponsebodyResource(this.hash); // subject of every statement added below
+        m.add(responseBodyRes, IPROP.results, ResourceFactory.createTypedLiteral(this.results)) // the two counts are primitives, so they are added unconditionally
+                .add(responseBodyRes, IPROP.bindings, ResourceFactory.createTypedLiteral(this.bindings));
+
+        if (this.variables != null) {
+            for (String variable : this.variables) {
+                m.add(responseBodyRes, IPROP.variable, ResourceFactory.createTypedLiteral(variable));
+            }
+        }
+        if (this.links != null) {
+            for (String link : this.links) {
+                m.add(responseBodyRes, IPROP.link, ResourceFactory.createTypedLiteral(link));
+            }
+        }
+        if (this.exception != null) {
+            m.add(responseBodyRes, IPROP.exception, ResourceFactory.createTypedLiteral(this.exception.toString())); // stored as its string form, not as a structured value
+        }
+
+        return m;
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		placeholder
		SELECT * WHERE {?s %%var1%% ?o . ?o %%var3%% %%var2%%}