From 27194da5c8855cec60104c51e8e9b951eeb420a7 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Mon, 14 Oct 2024 03:18:00 +0200
Subject: [PATCH] fix references extraction

---
 .../java/org/grobid/core/data/Dataset.java    |   2 +-
 .../grobid/core/engines/DatasetParser.java    | 121 +++++++++++-------
 .../controller/DatastetProcessFile.java       |  34 ++---
 3 files changed, 92 insertions(+), 65 deletions(-)

diff --git a/src/main/java/org/grobid/core/data/Dataset.java b/src/main/java/org/grobid/core/data/Dataset.java
index 250494f..483f658 100644
--- a/src/main/java/org/grobid/core/data/Dataset.java
+++ b/src/main/java/org/grobid/core/data/Dataset.java
@@ -279,7 +279,7 @@ public void setBibRefs(List<BiblioComponent> bibRefs) {
 
     public void addBibRef(BiblioComponent bibRef) {
         if (bibRefs == null) {
-            bibRefs = new ArrayList();
+            bibRefs = new ArrayList<>();
         }
         bibRefs.add(bibRef);
     }
diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index ad21a91..94e3381 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -1690,8 +1690,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                 localSequence.setRelevantSectionsImplicitDatasets(true);
                 selectedSequences.add(localSequence);
 
-                // Capture URLs if available
-
+                // Capture URLs and references if available
                 Map> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
                 localSequence.setReferences(referencesInText);
             }
@@ -1873,7 +1872,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 
         try {
-            String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type-> "not(contains(@type, '"+type+"'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']";
+            String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type -> "not(contains(@type, '" + type + "'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']";
             expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']";
 
             org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, doc,
@@ -1981,6 +1980,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                         Pair referenceInformation = referenceMap.get(target);
                         if (referenceInformation != null) {
                             BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
+                            refText = refText.replaceAll("[\\[\\], ]+", "");
+                            biblioRefMap.put(refText, biblioItem);
                             BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", "")));
                             biblioComponent.setRawForm(refText);
@@ -1999,8 +2000,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 
         List<LayoutToken> allDocumentTokens = new ArrayList<>();
 
-        int startingOffset = 0;
-        List<Integer> sentenceOffsetStarts = new ArrayList<>();
         for (DatasetDocumentSequence sequence : selectedSequences) {
             List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence.getText());
             sequence.setTokens(sentenceTokens);
@@ -2028,34 +2027,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 //            }
 //        });
 //
-            int finalStartingOffset = startingOffset;
-            List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream()
-                    .map(lt -> {
-                        lt.setOffset(lt.getOffset() + finalStartingOffset);
-                        return lt;
-                    })
-                    .collect(Collectors.toList());
+//            int finalStartingOffset = startingOffset;
+//            List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream()
+//                    .map(lt -> {
+//                        lt.setOffset(lt.getOffset() + finalStartingOffset);
+//                        return lt;
+//                    })
+//                    .collect(Collectors.toList());
 
-            allDocumentTokens.addAll(sentenceTokenAllTokens);
-            sentenceOffsetStarts.add(startingOffset);
-            startingOffset += sequence.getText().length();
+            allDocumentTokens.addAll(sentenceTokens);
         }
 
-        List<List<Dataset>> datasetLists = processing(selectedSequences, false);
+        List<List<Dataset>> datasetLists = processing(selectedSequences, disambiguate);
 
         entities.addAll(datasetLists);
 
-        for (int i = 0; i < entities.size(); i++) {
-            List<Dataset> datasets = entities.get(i);
-            if (datasets == null) {
-                continue;
-            }
-            for (Dataset dataset : datasets) {
-                if (dataset == null)
-                    continue;
-                dataset.setGlobalContextOffset(sentenceOffsetStarts.get(i));
-            }
-        }
 
         // TODO make sure that selectedSequences == allSentences above in the processPDF?
         List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList();
@@ -2101,7 +2087,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                     termPattern,
                     placeTaken.get(i),
                     frequencies,
-                    sentenceOffsetStarts.get(i)
+                    0
+//                    sentenceOffsetStarts.get(i)
             );
             if (localEntities != null) {
                 Collections.sort(localEntities);
@@ -2154,7 +2141,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
         // Enhance information in dataset entities
         if (CollectionUtils.isNotEmpty(bibRefComponents)) {
             // attach references to dataset entities
-            entities = attachRefBib(entities, bibRefComponents);
+            entities = attachRefBibSimple(entities, bibRefComponents);
         }
 
         // consolidate the attached ref bib (we don't consolidate all bibliographical references
@@ -2168,7 +2155,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                 for (BiblioComponent bibRef : bibRefs) {
                     Integer refKeyVal = bibRef.getRefKey();
                     if (!consolidated.contains(refKeyVal)) {
-                        BiblioItem biblioItem = biblioRefMap.get(refKeyVal);
+                        BiblioItem biblioItem = biblioRefMap.get(String.valueOf(refKeyVal));
                         BibDataSet biblioDataSet = new BibDataSet();
                         biblioDataSet.setResBib(biblioItem);
                         citationsToConsolidate.add(biblioDataSet);
@@ -2179,19 +2166,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                 }
             }
 
-        try {
-            Consolidation consolidator = Consolidation.getInstance();
-            Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate);
-            for (int j = 0; j < citationsToConsolidate.size(); j++) {
-                BiblioItem resCitation = citationsToConsolidate.get(j).getResBib();
-                BiblioItem bibo = resConsolidation.get(j);
-                if (bibo != null) {
-                    BiblioItem.correct(resCitation, bibo);
+        if (StringUtils.isNotBlank(datastetConfiguration.getGluttonHost())) {
+            try {
+                Consolidation consolidator = Consolidation.getInstance();
+                Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate);
+                for (int j = 0; j < citationsToConsolidate.size(); j++) {
+                    BiblioItem resCitation = citationsToConsolidate.get(j).getResBib();
+                    BiblioItem bibo = resConsolidation.get(j);
+                    if (bibo != null) {
+                        BiblioItem.correct(resCitation, bibo);
+                    }
                 }
+            } catch (Exception e) {
+                throw new GrobidException(
+                        "An exception occurred while running consolidation on bibliographical references.", e);
             }
-        } catch (Exception e) {
-            throw new GrobidException(
-                    "An exception occured while running consolidation on bibliographical references.", e);
         }
 
         // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref.
@@ -2230,8 +2219,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
         entities = DatasetContextClassifier.getInstance(datastetConfiguration)
                 .classifyDocumentContexts(entities);
 
-        List<BibDataSet> resCitations = List.of();
-        return Pair.of(entities, resCitations);
+        return Pair.of(entities, citationsToConsolidate);
     }
 
     private static String normalize(String text) {
@@ -2355,10 +2343,11 @@ public static boolean checkDASAnnex(List<LayoutToken> annexTokens) {
         return false;
     }
 
-    /**
-     * Try to attach relevant bib ref component to dataset entities
-     */
     public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) {
+        return attachRefBib(entities, refBibComponents, 5);
+    }
+
+    public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) {
 
         // we anchor the process to the dataset names and aggregate other closest components on the right
         // if we cross a bib ref component we attach it, if a bib ref component is just after the last
@@ -2387,7 +2376,7 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli
                     if ((refBib.getOffsetStart() >= pos) &&
-                        (refBib.getOffsetStart() <= endPos + 5)) {
+                        (refBib.getOffsetStart() <= endPos + distance)) {
                         entity.addBibRef(refBib);
                         endPos = refBib.getOffsetEnd();
                     }
@@ -2398,6 +2387,42 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli
         return entities;
     }
 
+    public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) {
+        return attachRefBibSimple(entities, refBibComponents, 5);
+    }
+
+    public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) {
+
+        // we anchor the process to the dataset names and aggregate other closest components on the right
+        // if we cross a bib ref component we attach it, if a bib ref component is just after the last
+        // component of the entity group, we attach it
+        for (List<Dataset> datasets : entities) {
+            for (Dataset entity : datasets) {
+                if (entity.getDatasetName() == null)
+                    continue;
+
+                // find the name component and the offset
+                DatasetComponent nameComponent = entity.getDatasetName();
+                int pos = nameComponent.getOffsetEnd();
+
+                // find included or just next bib ref callout
+                List<BiblioComponent> relatedReferences = refBibComponents.stream()
+                        .filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance)
+                        .collect(Collectors.toList());
+
+                if (CollectionUtils.isNotEmpty(relatedReferences)) {
+                    entity.setBibRefs(relatedReferences);
+                }
+            }
+        }
+
+        return entities;
+    }
+
     public List<List<OffsetPosition>> preparePlaceTaken(List<List<Dataset>> entities) {
         List<List<OffsetPosition>> localPositions = new ArrayList<>();
         for (List<Dataset> datasets : entities) {
@@ -2690,7 +2715,7 @@ public List<Dataset> propagateLayoutTokenSequence(DatasetDocumentSequence sequen
                 entity.getSequenceIdentifiers().addAll(name.getSequenceIdentifiers());
                 //entity.setType(DatastetLexicon.Dataset_Type.DATASET);
                 entity.setPropagated(true);
-                entity.setGlobalContextOffset(sentenceOffsetStart);
+//                entity.setGlobalContextOffset(sentenceOffsetStart);
                 if (entities == null)
                     entities = new ArrayList<>();
                 entities.add(entity);
diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
index 7e67afe..15b1546 100644
--- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
+++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
@@ -4,6 +4,7 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.grobid.core.data.BibDataSet;
@@ -333,7 +334,7 @@ public static Response processDatasetJATS(final InputStream inputStream,
             json.append(", \"md5\": \"" + md5Str + "\"");
             json.append(", \"mentions\":[");
-            if (extractedEntities != null && extractedEntities.size()>0) {
+            if (CollectionUtils.isNotEmpty(extractedEntities)) {
                 boolean startList = true;
                 for(List<Dataset> results : extractedEntities) {
                     for(Dataset dataset : results) {
@@ -348,12 +349,12 @@ public static Response processDatasetJATS(final InputStream inputStream,
 
             json.append("], \"references\":[");
 
-//            if (extractionResult != null) {
-//                List<BibDataSet> bibDataSet = extractionResult.getRight();
-//                if (bibDataSet != null && bibDataSet.size()>0) {
-//                    DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
-//                }
-//            }
+            if (CollectionUtils.isNotEmpty(extractedEntities)) {
+                List<BibDataSet> bibDataSet = extractionResult.getRight();
+                if (CollectionUtils.isNotEmpty(bibDataSet)) {
+                    DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
+                }
+            }
 
             json.append("]");
@@ -437,7 +438,7 @@ public static Response processDatasetTEI(final InputStream inputStream,
             String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase();
             json.append(", \"md5\": \"" + md5Str + "\"");
             json.append(", \"mentions\":[");
-            if (extractedEntities != null && extractedEntities.size()>0) {
+            if (CollectionUtils.isNotEmpty(extractedEntities)) {
                 boolean startList = true;
                 for(List<Dataset> results : extractedEntities) {
                     for(Dataset dataset : results) {
@@ -449,14 +450,15 @@ public static Response processDatasetTEI(final InputStream inputStream,
                     }
                 }
             }
-            json.append("], \"references\":[]");
-
-//            if (extractionResult != null) {
-//                List<BibDataSet> bibDataSet = extractionResult.getRight();
-//                if (bibDataSet != null && bibDataSet.size()>0) {
-//                    DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
-//                }
-//            }
+            json.append("], \"references\":[");
+
+            if (CollectionUtils.isNotEmpty(extractedEntities)) {
+                List<BibDataSet> bibDataSet = extractionResult.getRight();
+                if (CollectionUtils.isNotEmpty(bibDataSet)) {
+                    DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
+                }
+            }
+            json.append("]");
 
             float runtime = ((float)(end-start)/1000);
             json.append(", \"runtime\": "+ runtime);
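
Note: for readers who want to sanity-check the new attachment rule outside the GROBID codebase, below is a minimal, self-contained Java sketch of the offset logic introduced by attachRefBibSimple: a bibliographical callout is attached to a dataset mention when it starts at or after the end of the dataset name and ends within `distance` characters of it (default 5). The Mention and Callout records are hypothetical stand-ins for DatasetComponent and BiblioComponent; only the filter mirrors the patch.

import java.util.List;
import java.util.stream.Collectors;

public class RefAttachmentSketch {

    // Hypothetical stand-in for the dataset name component (DatasetComponent in datastet).
    record Mention(String name, int offsetStart, int offsetEnd) {}

    // Hypothetical stand-in for a bibliographical reference callout (BiblioComponent in datastet).
    record Callout(String rawForm, int offsetStart, int offsetEnd) {}

    // Same filter as attachRefBibSimple: keep callouts that start at or after the end
    // of the mention and end within `distance` characters of it.
    static List<Callout> attach(Mention mention, List<Callout> callouts, int distance) {
        int pos = mention.offsetEnd();
        return callouts.stream()
                .filter(ref -> ref.offsetStart() >= pos && ref.offsetEnd() <= pos + distance)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        Mention imagenet = new Mention("ImageNet", 10, 18);
        List<Callout> callouts = List.of(
                new Callout("[12]", 19, 23),  // adjacent callout: kept with distance = 5
                new Callout("[3]", 80, 83));  // distant callout: dropped

        // prints [Callout[rawForm=[12], offsetStart=19, offsetEnd=23]]
        System.out.println(attach(imagenet, callouts, 5));
    }
}

The companion change that strips brackets, commas, and spaces from refText (replaceAll("[\\[\\], ]+", "")) makes the keys stored in biblioRefMap line up with the String.valueOf(refKeyVal) lookups performed before consolidation, so that a callout rendered as "[12]" resolves the same entry as the numeric key 12 parsed from the target "b12".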