Skip to content

Commit

Permalink
retrieve URLs from the TEI XML in all the sections that are of interest
Browse files Browse the repository at this point in the history
(cherry picked from commit da6746c)
  • Loading branch information
lfoppiano committed Oct 13, 2024
1 parent 7b6fe06 commit 2162720
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -1590,7 +1590,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
//Extract relevant section from the TEI
// Title, abstract, keywords

// If we process the TEI, at this point the document should be already segmented correctly.
// TODO: remove this. If we process the TEI, at this point the document should already be segmented correctly.
boolean extractParagraphs = false;

XPath xPath = XPathFactory.newInstance().newXPath();
Expand Down Expand Up @@ -1770,6 +1770,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsNamedDatasets(true);
localSequence.setRelevantSectionsImplicitDatasets(false);
}

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight();
localSequence.setReferences(referencesInText);
}
}

Expand Down Expand Up @@ -1803,6 +1806,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(true);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down Expand Up @@ -1854,6 +1860,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsNamedDatasets(true);
localSequence.setRelevantSectionsImplicitDatasets(false);
}

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight();
localSequence.setReferences(referencesInText);
}
}

Expand Down Expand Up @@ -1881,6 +1890,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(true);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down Expand Up @@ -1911,6 +1923,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(false);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down

0 comments on commit 2162720

Please sign in to comment.