Skip to content

Commit

Permalink
fix regressions in the way we attach references from TEI
Browse files Browse the repository at this point in the history
(cherry picked from commit 1e658fd)
  • Loading branch information
lfoppiano committed Oct 22, 2024
1 parent b18454b commit 962f7eb
Showing 1 changed file with 41 additions and 26 deletions.
67 changes: 41 additions & 26 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -1965,16 +1965,25 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do


// We need to link the references and their callout
List<BiblioComponent> bibRefComponents = new ArrayList<>();
List<List<BiblioComponent>> referencesAsBiblioComponentSequences = new ArrayList<>();
Map<String, BiblioItem> biblioRefMap = new HashMap<>();

List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream()
.map(DatasetDocumentSequence::getReferences)
.filter(map -> map.values().stream()
.anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE)))
.toList();
List<Map<String, Triple<OffsetPosition, String, String>>> referencesInSequences = selectedSequences.stream()
.map(sequence -> sequence.getReferences().entrySet().stream()
.filter(entry -> BIBLIO_CALLOUT_TYPE.equals(entry.getValue().getRight()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)))
.collect(Collectors.toList());

// List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream()
// .map(DatasetDocumentSequence::getReferences)
// .filter(map -> map.values().stream()
// .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE)))
// .toList();

// We iterate over the sequences, and transform each reference into a BiblioComponent
for (Map<String, Triple<OffsetPosition, String, String>> ref : referencesInSequences) {
List<BiblioComponent> referencesInSequence = new ArrayList<>();

for (Map<String, Triple<OffsetPosition, String, String>> ref : referencesList) {
for (String refText : ref.keySet()) {
Triple<OffsetPosition, String, String> infos = ref.get(refText);

Expand All @@ -1984,19 +1993,22 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target);
if (referenceInformation != null) {
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
refText = refText.replaceAll("[\\[\\], ]+", "");
String refTextClean = refText.replaceAll("[\\[\\], ]+", "");

biblioRefMap.put(refText, biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", "")));
biblioRefMap.put(refTextClean, biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(
biblioItem, Integer.parseInt(target.replace("b", ""))
);
biblioComponent.setRawForm(refText);
biblioComponent.setOffsetStart(position.start);
biblioComponent.setOffsetEnd(position.end);
// TODO: fetch the coords if they are in the TEI
// List<BoundingBox> boundingBoxes = BoundingBoxCalculator.calculate(refTokens);
// biblioComponent.setBoundingBoxes(boundingBoxes);
bibRefComponents.add(biblioComponent);
referencesInSequence.add(biblioComponent);
}
}
referencesAsBiblioComponentSequences.add(referencesInSequence);
}

// Dataset Recognition
Expand Down Expand Up @@ -2143,9 +2155,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do


// Enhance information in dataset entities
if (CollectionUtils.isNotEmpty(bibRefComponents)) {
if (CollectionUtils.isNotEmpty(referencesAsBiblioComponentSequences)) {
// attach references to dataset entities
entities = attachRefBibSimple(entities, bibRefComponents);
entities = attachRefBibSimple(entities, referencesAsBiblioComponentSequences);
}

// consolidate the attached ref bib (we don't consolidate all bibliographical references
Expand Down Expand Up @@ -2395,36 +2407,39 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli
* Try to attach relevant bib ref components to dataset entities. This does not use global offsets because,
* in the TEI, all reference offsets are local to the sentence.
*/
public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) {
return attachRefBib(entities, refBibComponents, 5);
public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<List<BiblioComponent>> refBibComponents) {
return attachRefBibSimple(entities, refBibComponents, 5);
}

public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) {
public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> datasetsSequences, List<List<BiblioComponent>> referencesSequences, int distance) {

// we anchor the process to the dataset names and aggregate other closest components on the right
// if we cross a bib ref component we attach it, if a bib ref component is just after the last
// component of the entity group, we attach it
for (List<Dataset> datasets : entities) {
for (Dataset entity : datasets) {
if (entity.getDatasetName() == null)
for (int seqIdx = 0; seqIdx < datasetsSequences.size(); seqIdx++) {
List<Dataset> datasets = datasetsSequences.get(seqIdx);
List<BiblioComponent> references = referencesSequences.get(seqIdx);

for (Dataset dataset : datasets) {
if (dataset.getDatasetName() == null)
continue;

// find the name component and the offset
DatasetComponent nameComponent = entity.getDatasetName();
int pos = nameComponent.getOffsetEnd();
DatasetComponent nameComponent = dataset.getDatasetName();
int datasetEndPosition = nameComponent.getOffsetEnd();

// find included or just next bib ref callout
List<BiblioComponent> relatedReferences = refBibComponents.stream()
.filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance)
// find included or just next bib ref callouts: those starting at most `distance` characters after the end of the dataset name
List<BiblioComponent> relatedReferences = references.stream()
.filter(ref -> ref.getOffsetStart() >= datasetEndPosition && ref.getOffsetStart() <= datasetEndPosition + distance)
.collect(Collectors.toList());

if (CollectionUtils.isNotEmpty(relatedReferences)) {
entity.setBibRefs(relatedReferences);
dataset.setBibRefs(relatedReferences);
}
}
}

return entities;
return datasetsSequences;
}

public List<List<OffsetPosition>> preparePlaceTaken(List<List<Dataset>> entities) {
Expand Down

0 comments on commit 962f7eb

Please sign in to comment.