Skip to content

Commit

Permalink
cleanup API
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Oct 16, 2024
1 parent 127fbc2 commit b54c567
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 54 deletions.
1 change: 0 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ dependencies {
implementation "io.dropwizard.metrics:metrics-core:4.0.0"
implementation "io.dropwizard.metrics:metrics-servlets:4.0.0"

//Parsing xml/json
//Parsing xml/json
implementation group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: '2.10.1'
implementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.10.1'
Expand Down
51 changes: 23 additions & 28 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ public List<Dataset> processingString(String input, boolean disambiguate) {
}

private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSentences) {
// pre-process classification of every sentences in batch
// pre-process classification of every sentence in batch
if (this.dataseerClassifier == null)
dataseerClassifier = DataseerClassifier.getInstance();

Expand Down Expand Up @@ -629,8 +629,8 @@ private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSen
String localSentence = classificationNode.get("text").textValue();
// the following should never happen
if (!localSentence.equals(allSentences.get(totalClassificationNodes))) {
System.out.println("sentence, got: " + localSentence);
System.out.println("\texpecting: " + allSentences.get(totalClassificationNodes));
LOGGER.warn("sentence, got: " + localSentence);
LOGGER.warn("\texpecting: " + allSentences.get(totalClassificationNodes));
}
} else if (!field.equals("no_dataset")) {
scoresPerDatatypes.put(field, classificationNode.get(field).doubleValue());
Expand Down Expand Up @@ -658,7 +658,7 @@ private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSen
}

} catch (Exception e) {
e.printStackTrace();
LOGGER.error("General exception occurred during the classification with the DataSeer classifier", e);
}

return results;
Expand Down Expand Up @@ -1465,7 +1465,7 @@ public List<List<Dataset>> markDAS(List<List<Dataset>> entities, List<LayoutToke
return entities;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException {
public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
try {
String tei = processXML(file);
Expand All @@ -1480,15 +1480,15 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean
// TODO: call pub2TEI with sentence segmentation

// It's likely that JATS documents do not contain sentence-level segmentation
resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext);
resultExtraction = processTEIDocument(document, disambiguate);
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
}
return resultExtraction;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException {
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Expand All @@ -1498,7 +1498,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
org.w3c.dom.Element root = document.getDocumentElement();
if (segmentSentences)
segment(document, root);
resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext);
resultExtraction = processTEIDocument(document, disambiguate);
//tei = restoreDomParserAttributeBug(tei);

} catch (final Exception exp) {
Expand Down Expand Up @@ -1529,7 +1529,6 @@ public String processXML(File file) throws Exception {

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
tei = FileUtils.readFileToString(new File(newFilePath), UTF_8);

} catch (final Exception exp) {
Expand All @@ -1550,8 +1549,7 @@ public String processXML(File file) throws Exception {
*/
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String documentAsString,
boolean segmentSentences,
boolean disambiguate,
boolean addParagraphContext) {
boolean disambiguate) {

Pair<List<List<Dataset>>, List<BibDataSet>> tei = null;
try {
Expand All @@ -1564,12 +1562,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
if (segmentSentences)
segment(document, root);

tei = processTEIDocument(document, disambiguate, addParagraphContext);
} catch (ParserConfigurationException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SAXException e) {
tei = processTEIDocument(document, disambiguate);
} catch (ParserConfigurationException | IOException | SAXException e) {
e.printStackTrace();
}
return tei;
Expand All @@ -1582,8 +1576,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
* LF: This method attempts to reproduce the extraction from PDF in processPDF but with an already extracted TEI as input
*/
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext) {
boolean disambiguate) {

List<DatasetDocumentSequence> selectedSequences = new ArrayList<>();

Expand Down Expand Up @@ -2080,6 +2073,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
for (int i = 0; i < selectedSequences.size(); i++) {

DatasetDocumentSequence selectedSequence = selectedSequences.get(i);
// With TEI there is no sentence offset
List<Dataset> localEntities = propagateLayoutTokenSequence(
selectedSequence,
entities.get(i),
Expand All @@ -2088,7 +2082,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
placeTaken.get(i),
frequencies,
0
// sentenceOffsetStarts.get(i)
);
if (localEntities != null) {
Collections.sort(localEntities);
Expand Down Expand Up @@ -2550,11 +2543,13 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
String term = nameComponent.getRawForm();
term = term.replace("\n", " ");
term = term.replaceAll("( )+", " ");
term = term.replaceAll("^\"", " ");
term = term.replaceAll("\"$", " ");

if (term.trim().length() == 0)
if (StringUtils.isBlank(term))
continue;

// for safety, we don't propagate something that looks like a stopword with simply an Uppercase first letter
// for safety, we don't propagate something that looks like a stop word with simply an Uppercase first letter
if (FeatureFactory.getInstance().test_first_capital(term) &&
!FeatureFactory.getInstance().test_all_capital(term) &&
DatastetLexicon.getInstance().isEnglishStopword(term.toLowerCase())) {
Expand All @@ -2581,14 +2576,14 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
added.add(termCleaned);
}

// add common trivial variant singular/plurial
if (term.endsWith("dataset") || term.endsWith("Dataset")) {
// add common trivial variant singular/plural
if (StringUtils.endsWithIgnoreCase(term, "dataset")) {
String termAlt = term + "s";
if (!added.contains(termAlt)) {
termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false);
added.add(termAlt);
}
} else if (term.endsWith("datasets") || term.endsWith("Datasets")) {
} else if (StringUtils.endsWithIgnoreCase(term, "datasets")) {
String termAlt = term.substring(0, term.length() - 1);
if (!added.contains(termAlt)) {
termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false);
Expand All @@ -2608,7 +2603,7 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
}

public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, List<LayoutToken> tokens) {
Map<String, Integer> frequencies = new TreeMap<String, Integer>();
Map<String, Integer> frequencies = new TreeMap<>();
for (List<Dataset> datasets : entities) {
if (CollectionUtils.isEmpty(datasets)) {
continue;
Expand All @@ -2622,12 +2617,12 @@ public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, Lis
FastMatcher localTermPattern = new FastMatcher();
localTermPattern.loadTerm(term, DatastetAnalyzer.getInstance());
List<OffsetPosition> results = localTermPattern.matchLayoutToken(tokens, true, true);
// ignore delimiters, but case sensitive matching
// ignore delimiters, but case-sensitive matching
int freq = 0;
if (results != null) {
freq = results.size();
}
frequencies.put(term, Integer.valueOf(freq));
frequencies.put(term, freq);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class DatastetController implements DatastetPaths {
private static final String PDF = "pdf";
private static final String INPUT = "input";
private static final String JSON = "json";
private static final String DISAMBIGUATE = "addParagraphContext";
private static final String DISAMBIGUATE = "disambiguate";
private static final String SEGMENT_SENTENCES = "segmentSentences";

private DatastetConfiguration configuration;
Expand Down Expand Up @@ -144,9 +144,9 @@ public Response processDatasetTEI(
@Produces(MediaType.APPLICATION_JSON)
@POST
public Response processJATS(@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(DISAMBIGUATE) String addParagraphContext) {
boolean addParagraphContextBoolean = DatastetServiceUtils.validateBooleanRawParam(addParagraphContext);
return DatastetProcessFile.processDatasetJATS(inputStream, addParagraphContextBoolean);
@DefaultValue("0") @FormDataParam(DISAMBIGUATE) String disambiguate) {
boolean disambiguateBoolean = DatastetServiceUtils.validateBooleanRawParam(disambiguate);
return DatastetProcessFile.processDatasetJATS(inputStream, disambiguateBoolean);
}

@Path(PATH_DATASEER_TEI)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -292,17 +292,14 @@ public static Response processDatasetPDF(final InputStream inputStream,
* Uploads the origin XML, process it and return the extracted dataset mention objects in JSON.
*
* @param inputStream the data of origin XML
* @param addParagraphContext if true, the full paragraph where an annotation takes place is added
* @return a response object containing the JSON annotations
*/
public static Response processDatasetJATS(final InputStream inputStream,
boolean addParagraphContext) {
public static Response processDatasetJATS(final InputStream inputStream, Boolean disambiguate) {
LOGGER.debug(methodLogIn());
Response response = null;
File originFile = null;
DataseerClassifier classifier = DataseerClassifier.getInstance();
DatasetParser parser = DatasetParser.getInstance(classifier.getDatastetConfiguration());
JsonStringEncoder encoder = JsonStringEncoder.getInstance();

try {
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -318,7 +315,7 @@ public static Response processDatasetJATS(final InputStream inputStream,
} else {
long start = System.currentTimeMillis();

Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processXML(originFile, false, false, addParagraphContext);
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processXML(originFile, false, disambiguate);
long end = System.currentTimeMillis();

List<List<Dataset>> extractedEntities = null;
Expand Down Expand Up @@ -396,19 +393,18 @@ public static Response processDatasetJATS(final InputStream inputStream,
*
* @param inputStream the data of origin TEI
* @param segmentSentences add sentence segmentation if the TEI was not already segmented
* @param addParagraphContext if true, the full paragraph where an annotation takes place is added
* @return a response object containing the JSON annotations
*/
public static Response processDatasetTEI(final InputStream inputStream,
boolean segmentSentences,
boolean disambiguate,
boolean addParagraphContext) {
public static Response processDatasetTEI(
final InputStream inputStream,
boolean segmentSentences,
boolean disambiguate
) {
LOGGER.debug(methodLogIn());
Response response = null;
File originFile = null;
DataseerClassifier classifier = DataseerClassifier.getInstance();
DatasetParser parser = DatasetParser.getInstance(classifier.getDatastetConfiguration());
JsonStringEncoder encoder = JsonStringEncoder.getInstance();

try {
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -423,7 +419,7 @@ public static Response processDatasetTEI(final InputStream inputStream,
response = Response.status(Status.INTERNAL_SERVER_ERROR).build();
} else {
long start = System.currentTimeMillis();
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processTEI(originFile, segmentSentences, disambiguate, addParagraphContext);
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processTEI(originFile, segmentSentences, disambiguate);
long end = System.currentTimeMillis();

List<List<Dataset>> extractedEntities = null;
Expand Down Expand Up @@ -472,11 +468,6 @@ public static Response processDatasetTEI(final InputStream inputStream,
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
/*response = Response
.ok()
.type("application/json")
.entity(retValString)
.build();*/
}
}

Expand Down Expand Up @@ -515,8 +506,4 @@ private static boolean validateTrueFalseParam(String param) {
/**
 * Checks whether a raw service result string is usable.
 *
 * @param result the raw result string produced by the extraction, may be null
 * @return true if the result is non-null, non-empty and not whitespace-only
 */
public static boolean isResultOK(String result) {
    // StringUtils.isBlank() is true for null, empty, or whitespace-only input,
    // so a usable result is simply the negation — no ternary needed.
    return !StringUtils.isBlank(result);
}

// Convenience overload: delegates to the 4-argument variant with disambiguate
// hard-coded to false.
// NOTE(review): the third argument is named addParagraphContextBoolean and is
// forwarded into the addParagraphContext position of the 4-arg overload —
// confirm callers intend disambiguation to be disabled here.
public static Response processDatasetTEI(InputStream inputStream, boolean segmentSentences, boolean addParagraphContextBoolean) {
    return processDatasetTEI(inputStream, segmentSentences, false, addParagraphContextBoolean);
}
}

0 comments on commit b54c567

Please sign in to comment.