Skip to content

Commit

Permalink
retrieve URLs from the TEI XML in all the sections that are of interest
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Oct 13, 2024
1 parent 538c0eb commit da6746c
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -1590,7 +1590,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
//Extract relevant section from the TEI
// Title, abstract, keywords

// If we process the TEI, at this point the document should be already segmented correctly.
// TODO: remove this. If we process the TEI, the document should already be segmented correctly at this point.
boolean extractParagraphs = false;

XPath xPath = XPathFactory.newInstance().newXPath();
Expand Down Expand Up @@ -1770,6 +1770,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsNamedDatasets(true);
localSequence.setRelevantSectionsImplicitDatasets(false);
}

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight();
localSequence.setReferences(referencesInText);
}
}

Expand Down Expand Up @@ -1803,6 +1806,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(true);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down Expand Up @@ -1854,6 +1860,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsNamedDatasets(true);
localSequence.setRelevantSectionsImplicitDatasets(false);
}

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight();
localSequence.setReferences(referencesInText);
}
}

Expand Down Expand Up @@ -1881,6 +1890,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(true);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down Expand Up @@ -1911,6 +1923,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(false);
selectedSequences.add(localSequence);
availabilitySequences.add(localSequence);

Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}

} catch (XPathExpressionException e) {
Expand Down

0 comments on commit da6746c

Please sign in to comment.