Skip to content

Commit

Permalink
cleanup API
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Oct 16, 2024
1 parent 127fbc2 commit b54c567
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 54 deletions.
1 change: 0 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ dependencies {
implementation "io.dropwizard.metrics:metrics-core:4.0.0"
implementation "io.dropwizard.metrics:metrics-servlets:4.0.0"

//Parsing xml/json
//Parsing xml/json
implementation group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: '2.10.1'
implementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.10.1'
Expand Down
51 changes: 23 additions & 28 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ public List<Dataset> processingString(String input, boolean disambiguate) {
}

private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSentences) {
// pre-process classification of every sentences in batch
// pre-process classification of every sentence in batch
if (this.dataseerClassifier == null)
dataseerClassifier = DataseerClassifier.getInstance();

Expand Down Expand Up @@ -629,8 +629,8 @@ private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSen
String localSentence = classificationNode.get("text").textValue();
// the following should never happen
if (!localSentence.equals(allSentences.get(totalClassificationNodes))) {
System.out.println("sentence, got: " + localSentence);
System.out.println("\texpecting: " + allSentences.get(totalClassificationNodes));
LOGGER.warn("sentence, got: " + localSentence);
LOGGER.warn("\texpecting: " + allSentences.get(totalClassificationNodes));
}
} else if (!field.equals("no_dataset")) {
scoresPerDatatypes.put(field, classificationNode.get(field).doubleValue());
Expand Down Expand Up @@ -658,7 +658,7 @@ private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSen
}

} catch (Exception e) {
e.printStackTrace();
LOGGER.error("General exception occurred during the classification with the DataSeer classifier", e);
}

return results;
Expand Down Expand Up @@ -1465,7 +1465,7 @@ public List<List<Dataset>> markDAS(List<List<Dataset>> entities, List<LayoutToke
return entities;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException {
public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
try {
String tei = processXML(file);
Expand All @@ -1480,15 +1480,15 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean
// TODO: call pub2TEI with sentence segmentation

// It's likely that JATS documents do not contain sentence-level segmentation
resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext);
resultExtraction = processTEIDocument(document, disambiguate);
} catch (final Exception exp) {
LOGGER.error("An error occured while processing the following XML file: "
+ file.getPath(), exp);
}
return resultExtraction;
}

public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException {
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean segmentSentences, boolean disambiguate) throws IOException {
Pair<List<List<Dataset>>, List<BibDataSet>> resultExtraction = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Expand All @@ -1498,7 +1498,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean
org.w3c.dom.Element root = document.getDocumentElement();
if (segmentSentences)
segment(document, root);
resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext);
resultExtraction = processTEIDocument(document, disambiguate);
//tei = restoreDomParserAttributeBug(tei);

} catch (final Exception exp) {
Expand Down Expand Up @@ -1529,7 +1529,6 @@ public String processXML(File file) throws Exception {

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
tei = FileUtils.readFileToString(new File(newFilePath), UTF_8);

} catch (final Exception exp) {
Expand All @@ -1550,8 +1549,7 @@ public String processXML(File file) throws Exception {
*/
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String documentAsString,
boolean segmentSentences,
boolean disambiguate,
boolean addParagraphContext) {
boolean disambiguate) {

Pair<List<List<Dataset>>, List<BibDataSet>> tei = null;
try {
Expand All @@ -1564,12 +1562,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
if (segmentSentences)
segment(document, root);

tei = processTEIDocument(document, disambiguate, addParagraphContext);
} catch (ParserConfigurationException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SAXException e) {
tei = processTEIDocument(document, disambiguate);
} catch (ParserConfigurationException | IOException | SAXException e) {
e.printStackTrace();
}
return tei;
Expand All @@ -1582,8 +1576,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc
* LF: This method attempts to reproduce the extraction from PDF in processPDF but with an already extracted TEI as input
*/
public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext) {
boolean disambiguate) {

List<DatasetDocumentSequence> selectedSequences = new ArrayList<>();

Expand Down Expand Up @@ -2080,6 +2073,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
for (int i = 0; i < selectedSequences.size(); i++) {

DatasetDocumentSequence selectedSequence = selectedSequences.get(i);
// With TEI there is no sentence offset
List<Dataset> localEntities = propagateLayoutTokenSequence(
selectedSequence,
entities.get(i),
Expand All @@ -2088,7 +2082,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
placeTaken.get(i),
frequencies,
0
// sentenceOffsetStarts.get(i)
);
if (localEntities != null) {
Collections.sort(localEntities);
Expand Down Expand Up @@ -2550,11 +2543,13 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
String term = nameComponent.getRawForm();
term = term.replace("\n", " ");
term = term.replaceAll("( )+", " ");
term = term.replaceAll("^\"", " ");
term = term.replaceAll("\"$", " ");

if (term.trim().length() == 0)
if (StringUtils.isBlank(term))
continue;

// for safety, we don't propagate something that looks like a stopword with simply an Uppercase first letter
// for safety, we don't propagate something that looks like a stop word with simply an Uppercase first letter
if (FeatureFactory.getInstance().test_first_capital(term) &&
!FeatureFactory.getInstance().test_all_capital(term) &&
DatastetLexicon.getInstance().isEnglishStopword(term.toLowerCase())) {
Expand All @@ -2581,14 +2576,14 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
added.add(termCleaned);
}

// add common trivial variant singular/plurial
if (term.endsWith("dataset") || term.endsWith("Dataset")) {
// add common trivial variant singular/plural
if (StringUtils.endsWithIgnoreCase(term, "dataset")) {
String termAlt = term + "s";
if (!added.contains(termAlt)) {
termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false);
added.add(termAlt);
}
} else if (term.endsWith("datasets") || term.endsWith("Datasets")) {
} else if (StringUtils.endsWithIgnoreCase(term, "datasets")) {
String termAlt = term.substring(0, term.length() - 1);
if (!added.contains(termAlt)) {
termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false);
Expand All @@ -2608,7 +2603,7 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) {
}

public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, List<LayoutToken> tokens) {
Map<String, Integer> frequencies = new TreeMap<String, Integer>();
Map<String, Integer> frequencies = new TreeMap<>();
for (List<Dataset> datasets : entities) {
if (CollectionUtils.isEmpty(datasets)) {
continue;
Expand All @@ -2622,12 +2617,12 @@ public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, Lis
FastMatcher localTermPattern = new FastMatcher();
localTermPattern.loadTerm(term, DatastetAnalyzer.getInstance());
List<OffsetPosition> results = localTermPattern.matchLayoutToken(tokens, true, true);
// ignore delimiters, but case sensitive matching
// ignore delimiters, but case-sensitive matching
int freq = 0;
if (results != null) {
freq = results.size();
}
frequencies.put(term, Integer.valueOf(freq));
frequencies.put(term, freq);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class DatastetController implements DatastetPaths {
private static final String PDF = "pdf";
private static final String INPUT = "input";
private static final String JSON = "json";
private static final String DISAMBIGUATE = "addParagraphContext";
private static final String DISAMBIGUATE = "disambiguate";
private static final String SEGMENT_SENTENCES = "segmentSentences";

private DatastetConfiguration configuration;
Expand Down Expand Up @@ -144,9 +144,9 @@ public Response processDatasetTEI(
@Produces(MediaType.APPLICATION_JSON)
@POST
public Response processJATS(@FormDataParam(INPUT) InputStream inputStream,
@DefaultValue("0") @FormDataParam(DISAMBIGUATE) String addParagraphContext) {
boolean addParagraphContextBoolean = DatastetServiceUtils.validateBooleanRawParam(addParagraphContext);
return DatastetProcessFile.processDatasetJATS(inputStream, addParagraphContextBoolean);
@DefaultValue("0") @FormDataParam(DISAMBIGUATE) String disambiguate) {
boolean disambiguateBoolean = DatastetServiceUtils.validateBooleanRawParam(disambiguate);
return DatastetProcessFile.processDatasetJATS(inputStream, disambiguateBoolean);
}

@Path(PATH_DATASEER_TEI)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -292,17 +292,14 @@ public static Response processDatasetPDF(final InputStream inputStream,
* Uploads the origin XML, process it and return the extracted dataset mention objects in JSON.
*
* @param inputStream the data of origin XML
* @param addParagraphContext if true, the full paragraph where an annotation takes place is added
* @return a response object containing the JSON annotations
*/
public static Response processDatasetJATS(final InputStream inputStream,
boolean addParagraphContext) {
public static Response processDatasetJATS(final InputStream inputStream, Boolean disambiguate) {
LOGGER.debug(methodLogIn());
Response response = null;
File originFile = null;
DataseerClassifier classifier = DataseerClassifier.getInstance();
DatasetParser parser = DatasetParser.getInstance(classifier.getDatastetConfiguration());
JsonStringEncoder encoder = JsonStringEncoder.getInstance();

try {
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -318,7 +315,7 @@ public static Response processDatasetJATS(final InputStream inputStream,
} else {
long start = System.currentTimeMillis();

Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processXML(originFile, false, false, addParagraphContext);
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processXML(originFile, false, disambiguate);
long end = System.currentTimeMillis();

List<List<Dataset>> extractedEntities = null;
Expand Down Expand Up @@ -396,19 +393,18 @@ public static Response processDatasetJATS(final InputStream inputStream,
*
* @param inputStream the data of origin TEI
* @param segmentSentences add sentence segmentation if the TEI was not already segmented
* @param addParagraphContext if true, the full paragraph where an annotation takes place is added
* @return a response object containing the JSON annotations
*/
public static Response processDatasetTEI(final InputStream inputStream,
boolean segmentSentences,
boolean disambiguate,
boolean addParagraphContext) {
public static Response processDatasetTEI(
final InputStream inputStream,
boolean segmentSentences,
boolean disambiguate
) {
LOGGER.debug(methodLogIn());
Response response = null;
File originFile = null;
DataseerClassifier classifier = DataseerClassifier.getInstance();
DatasetParser parser = DatasetParser.getInstance(classifier.getDatastetConfiguration());
JsonStringEncoder encoder = JsonStringEncoder.getInstance();

try {
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -423,7 +419,7 @@ public static Response processDatasetTEI(final InputStream inputStream,
response = Response.status(Status.INTERNAL_SERVER_ERROR).build();
} else {
long start = System.currentTimeMillis();
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processTEI(originFile, segmentSentences, disambiguate, addParagraphContext);
Pair<List<List<Dataset>>, List<BibDataSet>> extractionResult = parser.processTEI(originFile, segmentSentences, disambiguate);
long end = System.currentTimeMillis();

List<List<Dataset>> extractedEntities = null;
Expand Down Expand Up @@ -472,11 +468,6 @@ public static Response processDatasetTEI(final InputStream inputStream,
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
/*response = Response
.ok()
.type("application/json")
.entity(retValString)
.build();*/
}
}

Expand Down Expand Up @@ -515,8 +506,4 @@ private static boolean validateTrueFalseParam(String param) {
/**
 * Checks whether a raw service result string is usable.
 *
 * @param result the raw result string produced by the extraction, may be null
 * @return true if the result is non-null, non-empty and not whitespace-only
 */
public static boolean isResultOK(String result) {
    // StringUtils.isBlank() is true for null, empty, or whitespace-only input,
    // so a usable result is simply the negation — no ternary needed.
    return !StringUtils.isBlank(result);
}

// Convenience overload: delegates to the 4-argument variant with disambiguate
// hard-coded to false.
// NOTE(review): the third argument is named addParagraphContextBoolean and is
// forwarded into the addParagraphContext position of the 4-arg overload —
// confirm callers intend disambiguation to be disabled here.
public static Response processDatasetTEI(InputStream inputStream, boolean segmentSentences, boolean addParagraphContextBoolean) {
    return processDatasetTEI(inputStream, segmentSentences, false, addParagraphContextBoolean);
}
}

0 comments on commit b54c567

Please sign in to comment.