From 1e658fda090cab67a6ad94633683c4ba7e83a014 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 22 Oct 2024 09:46:24 +0200 Subject: [PATCH] fix regressions in the way we attach references from TEI --- .../grobid/core/engines/DatasetParser.java | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index eec1f4c..58fe8fd 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1958,16 +1958,25 @@ public Pair>, List> processTEIDocument(org.w3c.do // We need to link the references and their callout - List bibRefComponents = new ArrayList<>(); + List> referencesAsBiblioComponentSequences = new ArrayList<>(); Map biblioRefMap = new HashMap<>(); - List>> referencesList = selectedSequences.stream() - .map(DatasetDocumentSequence::getReferences) - .filter(map -> map.values().stream() - .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) - .toList(); + List>> referencesInSequences = selectedSequences.stream() + .map(sequence -> sequence.getReferences().entrySet().stream() + .filter(entry -> BIBLIO_CALLOUT_TYPE.equals(entry.getValue().getRight())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))) + .collect(Collectors.toList()); + +// List>> referencesList = selectedSequences.stream() +// .map(DatasetDocumentSequence::getReferences) +// .filter(map -> map.values().stream() +// .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) +// .toList(); + + // We iterate over the sequences, and transform each reference into a BiblioComponent + for (Map> ref : referencesInSequences) { + List referencesInSequence = new ArrayList<>(); - for (Map> ref : referencesList) { for (String refText : ref.keySet()) { Triple infos = ref.get(refText); @@ -1977,19 +1986,22 @@ public Pair>, List> processTEIDocument(org.w3c.do Pair referenceInformation = referenceMap.get(target); if (referenceInformation != null) { BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); - refText = refText.replaceAll("[\\[\\], ]+", ""); + String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); - biblioRefMap.put(refText, biblioItem); - BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", ""))); + biblioRefMap.put(refTextClean, biblioItem); + BiblioComponent biblioComponent = new BiblioComponent( + biblioItem, Integer.parseInt(target.replace("b", "")) + ); biblioComponent.setRawForm(refText); biblioComponent.setOffsetStart(position.start); biblioComponent.setOffsetEnd(position.end); // TODO: fetch the coords if they are in the TEI // List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); // biblioComponent.setBoundingBoxes(boundingBoxes); - bibRefComponents.add(biblioComponent); + referencesInSequence.add(biblioComponent); } } + referencesAsBiblioComponentSequences.add(referencesInSequence); } // Dataset Recognition @@ -2136,9 +2148,9 @@ public Pair>, List> processTEIDocument(org.w3c.do // Enhance information in dataset entities - if (CollectionUtils.isNotEmpty(bibRefComponents)) { + if (CollectionUtils.isNotEmpty(referencesAsBiblioComponentSequences)) { // attach references to dataset entities - entities = attachRefBibSimple(entities, bibRefComponents); + entities = attachRefBibSimple(entities, referencesAsBiblioComponentSequences); } // consolidate the attached ref bib (we don't consolidate all bibliographical references @@ -2388,36 +2400,39 @@ public List> attachRefBib(List> entities, List> attachRefBibSimple(List> entities, List refBibComponents) { - return attachRefBib(entities, refBibComponents, 5); + public List> attachRefBibSimple(List> entities, List> refBibComponents) { + return attachRefBibSimple(entities, refBibComponents, 5); } - public List> attachRefBibSimple(List> entities, List refBibComponents, int distance) { + public List> attachRefBibSimple(List> datasetsSequences, List> referencesSequences, int distance) { // we anchor the process to the dataset names and aggregate other closest components on the right // if we cross a bib ref component we attach it, if a bib ref component is just after the last // component of the entity group, we attach it - for (List datasets : entities) { - for (Dataset entity : datasets) { - if (entity.getDatasetName() == null) + for (int seqIdx = 0; seqIdx < datasetsSequences.size(); seqIdx++) { + List datasets = datasetsSequences.get(seqIdx); + List references = referencesSequences.get(seqIdx); + + for (Dataset dataset : datasets) { + if (dataset.getDatasetName() == null) continue; // find the name component and the offset - DatasetComponent nameComponent = entity.getDatasetName(); - int pos = nameComponent.getOffsetEnd(); + DatasetComponent nameComponent = dataset.getDatasetName(); + int datasetEndPosition = nameComponent.getOffsetEnd(); - // find included or just next bib ref callout - List relatedReferences = refBibComponents.stream() - .filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance) + // find included or just next bib ref callout within a distance of 5 characters + List relatedReferences = references.stream() + .filter(ref -> ref.getOffsetStart() >= datasetEndPosition && ref.getOffsetStart() <= datasetEndPosition + distance) .collect(Collectors.toList()); if (CollectionUtils.isNotEmpty(relatedReferences)) { - entity.setBibRefs(relatedReferences); + dataset.setBibRefs(relatedReferences); } } } - return entities; + return datasetsSequences; } public List> preparePlaceTaken(List> entities) {