Skip to content

Commit

Permalink
fix extraction of urls that are not well formed (supplementary-material generated by pub2tei)
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 2, 2025
1 parent 54bc62a commit 39c0e43
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 61 deletions.
77 changes: 20 additions & 57 deletions src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
Original file line number Diff line number Diff line change
@@ -1,73 +1,36 @@
package org.grobid.core.engines;

import nu.xom.Attribute;
import nu.xom.Element;
import com.fasterxml.jackson.core.io.JsonStringEncoder;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.DatasetComponent;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.HttpHostConnectException;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.grobid.core.data.Dataset;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.engines.label.DatasetTaggingLabels;
import org.grobid.core.engines.label.SegmentationLabels;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.features.FeaturesVectorDataseer;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.data.DatasetComponent;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.lexicon.DatastetLexicon;
import org.grobid.core.utilities.DatastetConfiguration;
import org.grobid.core.utilities.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.databind.*;
import com.fasterxml.jackson.databind.node.*;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.core.io.*;

import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;

import java.net.HttpURLConnection;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.conn.HttpHostConnectException;
import org.apache.commons.lang3.tuple.Pair;

import static org.apache.commons.lang3.StringUtils.*;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import java.util.*;

/**
* Dataset entity disambiguator. Once dataset mentions are recognized and grouped
Expand Down
9 changes: 7 additions & 2 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,11 @@ private List<DatasetComponent> addUrlComponentsAsReferences(DatasetDocumentSeque
String target = urlInfos.getMiddle();
// String type = urlInfos.getRight();

DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end));
String sequenceText = sequence.getText();
if (sequenceText.length() <= pos.start || sequenceText.length() <= pos.end) {
continue;
}
DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end));
urlComponent.setOffsetStart(pos.start);
urlComponent.setOffsetEnd(pos.end);
if (target != null) {
Expand Down Expand Up @@ -2005,7 +2009,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

biblioRefMap.put(refTextClean, biblioItem);

Integer refKey = biblioComponentWrapper.getRefKey(target); BiblioComponent biblioComponent = new BiblioComponent(
Integer refKey = biblioComponentWrapper.getRefKey(target);
BiblioComponent biblioComponent = new BiblioComponent(
biblioItem, refKey
);
biblioComponent.setRawForm(refText);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/grobid/core/utilities/XMLUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -223,15 +223,15 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
for (int j = 0; j < list2.getLength(); j++) {
Node node2 = list2.item(j);
if (node2.getNodeType() == Node.TEXT_NODE) {
String chunk = node2.getNodeValue();
String chunk = normalize(node2.getNodeValue());
buf.append(chunk);
found = true;
indexPos += chunk.length();
}
}
}
} else if (node.getNodeType() == Node.TEXT_NODE) {
String chunk = node.getNodeValue();
String chunk = normalize(node.getNodeValue());
buf.append(chunk);
found = true;
indexPos += chunk.length();
Expand Down

0 comments on commit 39c0e43

Please sign in to comment.