From 39c0e43ee5d6cfb5a2f4d7ef0439ec814cceb73c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Jan 2025 19:48:38 +0100 Subject: [PATCH] fix extraction of urls that are not well formed (supplementary-material generated by pub2tei) --- .../core/engines/DatasetDisambiguator.java | 77 +++++-------------- .../grobid/core/engines/DatasetParser.java | 9 ++- .../grobid/core/utilities/XMLUtilities.java | 4 +- 3 files changed, 29 insertions(+), 61 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java index 4744b25..c1b78e7 100644 --- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java @@ -1,73 +1,36 @@ package org.grobid.core.engines; -import nu.xom.Attribute; -import nu.xom.Element; +import com.fasterxml.jackson.core.io.JsonStringEncoder; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpEntity; import org.apache.http.client.config.RequestConfig; -import org.grobid.core.GrobidModels; -import org.grobid.core.data.DatasetComponent; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.conn.HttpHostConnectException; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.mime.HttpMultipartMode; +import org.apache.http.entity.mime.MultipartEntityBuilder; +import org.apache.http.entity.mime.content.StringBody; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; import org.grobid.core.data.Dataset; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.xml.XmlBuilderUtils; -import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.label.DatasetTaggingLabels; -import org.grobid.core.engines.label.SegmentationLabels; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorDataseer; -import org.grobid.core.layout.BoundingBox; +import org.grobid.core.data.DatasetComponent; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.lexicon.DatastetLexicon; import org.grobid.core.utilities.DatastetConfiguration; -import org.grobid.core.utilities.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.InputSource; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; -import java.net.HttpURLConnection; +import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; - -import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.UrlEncodedFormEntity; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPost; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.message.BasicNameValuePair; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.HttpEntity; -import org.apache.http.util.EntityUtils; -import org.apache.http.entity.mime.content.StringBody; -import org.apache.http.entity.ContentType; -import org.apache.http.entity.mime.MultipartEntityBuilder; -import org.apache.http.entity.mime.HttpMultipartMode; -import org.apache.http.conn.HttpHostConnectException; -import org.apache.commons.lang3.tuple.Pair; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; +import java.util.*; /** * Dataset entity disambiguator. Once dataset mentions are recognized and grouped diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 56f7b03..a57bee6 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -550,7 +550,11 @@ private List addUrlComponentsAsReferences(DatasetDocumentSeque String target = urlInfos.getMiddle(); // String type = urlInfos.getRight(); - DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end)); + String sequenceText = sequence.getText(); + if (sequenceText.length() <= pos.start || sequenceText.length() <= pos.end) { + continue; + } + DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end)); urlComponent.setOffsetStart(pos.start); urlComponent.setOffsetEnd(pos.end); if (target != null) { @@ -2005,7 +2009,8 @@ public Pair>, List> processTEIDocument(org.w3c.do biblioRefMap.put(refTextClean, biblioItem); - Integer refKey = biblioComponentWrapper.getRefKey(target); BiblioComponent biblioComponent = new BiblioComponent( + Integer refKey = biblioComponentWrapper.getRefKey(target); + BiblioComponent biblioComponent = new BiblioComponent( biblioItem, refKey ); biblioComponent.setRawForm(refText); diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index d5d5a95..63aebab 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -223,7 +223,7 @@ public static Pair>> g for (int j = 0; j < list2.getLength(); j++) { Node node2 = list2.item(j); if (node2.getNodeType() == Node.TEXT_NODE) { - String chunk = node2.getNodeValue(); + String chunk = normalize(node2.getNodeValue()); buf.append(chunk); found = true; indexPos += chunk.length(); @@ -231,7 +231,7 @@ public static Pair>> g } } } else if (node.getNodeType() == Node.TEXT_NODE) { - String chunk = node.getNodeValue(); + String chunk = normalize(node.getNodeValue()); buf.append(chunk); found = true; indexPos += chunk.length();