Skip to content

Commit

Permalink
cosmetics
Browse files Browse the repository at this point in the history
(cherry picked from commit 0a5cedd)
  • Loading branch information
lfoppiano committed Oct 22, 2024
1 parent 774dd78 commit b18454b
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -234,17 +234,17 @@ public List<List<Dataset>> processing(List<DatasetDocumentSequence> datasetDocum
for (Dataset entity : localDatasets) {
if (entity.getDatasetName() != null) {
String term = entity.getDatasetName().getNormalizedForm();
if (term == null || term.length() == 0) {
indexToBeFiltered.add(Integer.valueOf(k));
if (StringUtils.isBlank(term)) {
indexToBeFiltered.add(k);
} else if (DatastetLexicon.getInstance().isEnglishStopword(term)) {
indexToBeFiltered.add(Integer.valueOf(k));
indexToBeFiltered.add(k);
} else if (DatastetLexicon.getInstance().isBlackListedNamedDataset(term.toLowerCase())) {
indexToBeFiltered.add(Integer.valueOf(k));
indexToBeFiltered.add(k);
}
}
k++;
}
if (indexToBeFiltered.size() > 0) {
if (CollectionUtils.isNotEmpty(indexToBeFiltered)) {
for (int j = indexToBeFiltered.size() - 1; j >= 0; j--) {
localDatasets.remove(indexToBeFiltered.get(j).intValue());
}
Expand Down Expand Up @@ -1596,7 +1596,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
XPath xPath = XPathFactory.newInstance().newXPath();

try {
org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate("//*[local-name() = 'titleStmt']/*[local-name() = 'title']",
org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate(
"//*[local-name() = 'titleStmt']/*[local-name() = 'title']",
doc,
XPathConstants.NODE);
if (titleNode == null) {
Expand Down Expand Up @@ -1729,7 +1730,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
// Annex might contain misclassified relevant sections
try {
String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='annex']]/*[local-name() = 'div']";
org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression,
org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(
expression,
doc,
XPathConstants.NODESET);
for (int i = 0; i < bodyNodeList.getLength(); i++) {
Expand Down Expand Up @@ -1783,14 +1785,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
// specific section types statement
DatastetAnalyzer datastetAnalyzer = DatastetAnalyzer.getInstance();

List<String> specificSectionTypesAnnex = Arrays.asList("availability", "acknowledgement", "funding");
// Acknowledgement and funding sections may be misleading, so only the availability section types are processed
List<String> specificSectionTypesAnnex = Arrays.asList("availability", "data-availability");

List<DatasetDocumentSequence> availabilitySequences = new ArrayList<>();
for (String sectionType : specificSectionTypesAnnex) {
try {
String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']";
expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']";
org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression,
org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(
expression,
doc,
XPathConstants.NODESET);
for (int i = 0; i < annexNodeList.getLength(); i++) {
Expand Down

0 comments on commit b18454b

Please sign in to comment.