From b18454b5ca149e1bd60a37b6bc83ed6c6083cb6d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 22 Oct 2024 04:09:29 +0200 Subject: [PATCH] cosmetics (cherry picked from commit 0a5cedd91434a345bc79657236cf1dbe142650e6) --- .../grobid/core/engines/DatasetParser.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 379cd00..5323e7e 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -234,17 +234,17 @@ public List> processing(List datasetDocum for (Dataset entity : localDatasets) { if (entity.getDatasetName() != null) { String term = entity.getDatasetName().getNormalizedForm(); - if (term == null || term.length() == 0) { - indexToBeFiltered.add(Integer.valueOf(k)); + if (StringUtils.isBlank(term)) { + indexToBeFiltered.add(k); } else if (DatastetLexicon.getInstance().isEnglishStopword(term)) { - indexToBeFiltered.add(Integer.valueOf(k)); + indexToBeFiltered.add(k); } else if (DatastetLexicon.getInstance().isBlackListedNamedDataset(term.toLowerCase())) { - indexToBeFiltered.add(Integer.valueOf(k)); + indexToBeFiltered.add(k); } } k++; } - if (indexToBeFiltered.size() > 0) { + if (CollectionUtils.isNotEmpty(indexToBeFiltered)) { for (int j = indexToBeFiltered.size() - 1; j >= 0; j--) { localDatasets.remove(indexToBeFiltered.get(j).intValue()); } @@ -1596,7 +1596,8 @@ public Pair>, List> processTEIDocument(org.w3c.do XPath xPath = XPathFactory.newInstance().newXPath(); try { - org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate("//*[local-name() = 'titleStmt']/*[local-name() = 'title']", + org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate( + "//*[local-name() = 'titleStmt']/*[local-name() = 'title']", doc, XPathConstants.NODE); if (titleNode == null) { @@ -1729,7 +1730,8 @@ public Pair>, List> processTEIDocument(org.w3c.do // Annex might contain misclassified relevant sections try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='annex']]/*[local-name() = 'div']"; - org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate( + expression, doc, XPathConstants.NODESET); for (int i = 0; i < bodyNodeList.getLength(); i++) { @@ -1783,14 +1785,16 @@ public Pair>, List> processTEIDocument(org.w3c.do // specific section types statement DatastetAnalyzer datastetAnalyzer = DatastetAnalyzer.getInstance(); - List specificSectionTypesAnnex = Arrays.asList("availability", "acknowledgement", "funding"); + // Looks like acknowledgment and funding may be misleading + List specificSectionTypesAnnex = Arrays.asList("availability", "data-availability"); List availabilitySequences = new ArrayList<>(); for (String sectionType : specificSectionTypesAnnex) { try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; - org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate( + expression, doc, XPathConstants.NODESET); for (int i = 0; i < annexNodeList.getLength(); i++) {