fix references extraction
(cherry picked from commit 27194da)
lfoppiano committed Oct 14, 2024
1 parent 1483aab commit 4ab67a6
Showing 3 changed files with 92 additions and 65 deletions.
src/main/java/org/grobid/core/data/Dataset.java (2 changes: 1 addition & 1 deletion)

@@ -279,7 +279,7 @@ public void setBibRefs(List<BiblioComponent> bibRefs) {

public void addBibRef(BiblioComponent bibRef) {
if (bibRefs == null) {
bibRefs = new ArrayList<BiblioComponent>();
bibRefs = new ArrayList<>();
}
bibRefs.add(bibRef);
}
src/main/java/org/grobid/core/engines/DatasetParser.java (121 changes: 73 additions & 48 deletions)

@@ -1690,8 +1690,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
localSequence.setRelevantSectionsImplicitDatasets(true);
selectedSequences.add(localSequence);

// Capture URLs if available

// Capture URLs and references if available
Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight();
localSequence.setReferences(referencesInText);
}
@@ -1873,7 +1872,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do


try {
String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type-> "not(contains(@type, '"+type+"'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']";
String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type -> "not(contains(@type, '" + type + "'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']";
expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']";
org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression,
doc,
@@ -1981,6 +1980,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target);
if (referenceInformation != null) {
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
refText = refText.replaceAll("[\\[\\], ]+", "");

biblioRefMap.put(refText, biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", "")));
biblioComponent.setRawForm(refText);
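The normalization added here strips brackets, commas, and spaces from the callout text before it is stored as the biblioRefMap key, which lines up with the later lookup via biblioRefMap.get(String.valueOf(refKeyVal)). A minimal sketch of the regex behaviour, with made-up callout strings:

    // Illustrative only: what the added replaceAll does to typical callout texts
    public class RefTextNormalizationSketch {
        public static void main(String[] args) {
            System.out.println("[12]".replaceAll("[\\[\\], ]+", ""));   // "12"
            System.out.println("[3, 4]".replaceAll("[\\[\\], ]+", "")); // "34" (grouped callouts collapse)
        }
    }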
@@ -1999,8 +2000,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

List<LayoutToken> allDocumentTokens = new ArrayList<>();

int startingOffset = 0;
List<Integer> sentenceOffsetStarts = new ArrayList<>();
for (DatasetDocumentSequence sequence : selectedSequences) {
List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence.getText());
sequence.setTokens(sentenceTokens);
@@ -2028,34 +2027,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
// }
// });
//
int finalStartingOffset = startingOffset;
List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream()
.map(lt -> {
lt.setOffset(lt.getOffset() + finalStartingOffset);
return lt;
})
.collect(Collectors.toList());
// int finalStartingOffset = startingOffset;
// List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream()
// .map(lt -> {
// lt.setOffset(lt.getOffset() + finalStartingOffset);
// return lt;
// })
// .collect(Collectors.toList());

allDocumentTokens.addAll(sentenceTokenAllTokens);
sentenceOffsetStarts.add(startingOffset);
startingOffset += sequence.getText().length();
allDocumentTokens.addAll(sentenceTokens);
}

List<List<Dataset>> datasetLists = processing(selectedSequences, false);
List<List<Dataset>> datasetLists = processing(selectedSequences, disambiguate);

entities.addAll(datasetLists);

for (int i = 0; i < entities.size(); i++) {
List<Dataset> datasets = entities.get(i);
if (datasets == null) {
continue;
}
for (Dataset dataset : datasets) {
if (dataset == null)
continue;
dataset.setGlobalContextOffset(sentenceOffsetStarts.get(i));
}
}

// TODO make sure that selectedSequences == allSentences above in the processPDF?
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList();
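The lines removed above dropped the per-sequence global-offset bookkeeping (startingOffset, sentenceOffsetStarts): in the TEI flow every sequence keeps sentence-local offsets, so tokens are added unchanged and matching happens within a sentence (see attachRefBibSimple further down). For contrast, a minimal sketch of the kind of global-offset arithmetic that was removed, in plain Java with illustrative data:

    import java.util.ArrayList;
    import java.util.List;

    public class GlobalOffsetSketch {
        public static void main(String[] args) {
            List<String> sentences = List.of("Dataset A [1].", "We reuse Dataset A.");
            List<Integer> sentenceOffsetStarts = new ArrayList<>();
            int startingOffset = 0;
            for (String sentence : sentences) {
                sentenceOffsetStarts.add(startingOffset);
                startingOffset += sentence.length();
            }
            // A mention at local offset 9 in the second sentence had global offset 14 + 9 = 23;
            // after this commit the TEI path simply keeps the local offset 9.
            System.out.println(sentenceOffsetStarts.get(1) + 9);
        }
    }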
@@ -2101,7 +2087,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
termPattern,
placeTaken.get(i),
frequencies,
sentenceOffsetStarts.get(i)
0
// sentenceOffsetStarts.get(i)
);
if (localEntities != null) {
Collections.sort(localEntities);
@@ -2154,7 +2141,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
// Enhance information in dataset entities
if (CollectionUtils.isNotEmpty(bibRefComponents)) {
// attach references to dataset entities
entities = attachRefBib(entities, bibRefComponents);
entities = attachRefBibSimple(entities, bibRefComponents);
}

// consolidate the attached ref bib (we don't consolidate all bibliographical references
@@ -2168,7 +2155,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
for (BiblioComponent bibRef : bibRefs) {
Integer refKeyVal = bibRef.getRefKey();
if (!consolidated.contains(refKeyVal)) {
BiblioItem biblioItem = biblioRefMap.get(refKeyVal);
BiblioItem biblioItem = biblioRefMap.get(String.valueOf(refKeyVal));
BibDataSet biblioDataSet = new BibDataSet();
biblioDataSet.setResBib(biblioItem);
citationsToConsolidate.add(biblioDataSet);
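The one-line change above fixes a key-type mismatch: biblioRefMap is keyed by the normalized refText String (see the earlier hunk), while refKeyVal is an Integer. Map.get accepts any Object, so the old call compiled but could never find an entry. A minimal reproduction with placeholder values:

    import java.util.HashMap;
    import java.util.Map;

    public class MapKeyTypeSketch {
        public static void main(String[] args) {
            Map<String, String> biblioRefMap = new HashMap<>();
            biblioRefMap.put("12", "BiblioItem for reference 12");

            Integer refKeyVal = 12;
            System.out.println(biblioRefMap.get(refKeyVal));                 // null: an Integer never equals a String key
            System.out.println(biblioRefMap.get(String.valueOf(refKeyVal))); // "BiblioItem for reference 12"
        }
    }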
@@ -2179,19 +2166,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
}
}

try {
Consolidation consolidator = Consolidation.getInstance();
Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate);
for (int j = 0; j < citationsToConsolidate.size(); j++) {
BiblioItem resCitation = citationsToConsolidate.get(j).getResBib();
BiblioItem bibo = resConsolidation.get(j);
if (bibo != null) {
BiblioItem.correct(resCitation, bibo);
if (StringUtils.isNotBlank(datastetConfiguration.getGluttonHost())) {
try {
Consolidation consolidator = Consolidation.getInstance();
Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate);
for (int j = 0; j < citationsToConsolidate.size(); j++) {
BiblioItem resCitation = citationsToConsolidate.get(j).getResBib();
BiblioItem bibo = resConsolidation.get(j);
if (bibo != null) {
BiblioItem.correct(resCitation, bibo);
}
}
} catch (Exception e) {
throw new GrobidException(
"An exception occurred while running consolidation on bibliographical references.", e);
}
} catch (Exception e) {
throw new GrobidException(
"An exception occured while running consolidation on bibliographical references.", e);
}

// propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref.
@@ -2230,8 +2219,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
entities = DatasetContextClassifier.getInstance(datastetConfiguration)
.classifyDocumentContexts(entities);

List<BibDataSet> resCitations = List.of();
return Pair.of(entities, resCitations);
return Pair.of(entities, citationsToConsolidate);
}

private static String normalize(String text) {
@@ -2355,10 +2343,11 @@ public static boolean checkDASAnnex(List<LayoutToken> annexTokens) {
return false;
}

/**
* Try to attach relevant bib ref component to dataset entities
*/
public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) {
return attachRefBib(entities, refBibComponents, 5);
}

public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) {

// we anchor the process to the dataset names and aggregate other closest components on the right
// if we cross a bib ref component we attach it, if a bib ref component is just after the last
Expand Down Expand Up @@ -2387,7 +2376,7 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli
for (BiblioComponent refBib : refBibComponents) {
//System.out.println(refBib.getOffsetStart() + " - " + refBib.getOffsetStart());
if ((refBib.getOffsetStart() >= pos) &&
(refBib.getOffsetStart() <= endPos + 5)) {
(refBib.getOffsetStart() <= endPos + distance)) {
entity.addBibRef(refBib);
endPos = refBib.getOffsetEnd();
}
Expand All @@ -2398,6 +2387,42 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli
return entities;
}

/**
* Try to attach relevant bib ref components to dataset entities. This variant does not use global offsets,
* since in the TEI all references' offsets are local to the sentence.
*/
public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) {
return attachRefBibSimple(entities, refBibComponents, 5);
}

public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) {

// we anchor the process to the dataset names and aggregate other closest components on the right
// if we cross a bib ref component we attach it, if a bib ref component is just after the last
// component of the entity group, we attach it
for (List<Dataset> datasets : entities) {
for (Dataset entity : datasets) {
if (entity.getDatasetName() == null)
continue;

// find the name component and the offset
DatasetComponent nameComponent = entity.getDatasetName();
int pos = nameComponent.getOffsetEnd();

// find included or just next bib ref callout
List<BiblioComponent> relatedReferences = refBibComponents.stream()
.filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance)
.collect(Collectors.toList());

if (CollectionUtils.isNotEmpty(relatedReferences)) {
entity.setBibRefs(relatedReferences);
}
}
}

return entities;
}
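Unlike attachRefBib above, which walks to the right and keeps extending endPos across successive callouts using document-global offsets, this variant collects every callout whose whole span falls inside a fixed window after the end of the dataset name, using sentence-local offsets. A toy illustration of the window test with the default distance of 5 and made-up offsets:

    import java.util.List;
    import java.util.stream.Collectors;

    public class OffsetWindowSketch {
        record Callout(int start, int end) {}

        public static void main(String[] args) {
            int pos = 9;      // local offset where the dataset name ends
            int distance = 5; // window used by the two-argument overload above
            List<Callout> callouts = List.of(new Callout(10, 14), new Callout(40, 44));
            List<Callout> attached = callouts.stream()
                    .filter(c -> c.start() >= pos && c.end() <= pos + distance)
                    .collect(Collectors.toList());
            System.out.println(attached); // [Callout[start=10, end=14]]: only the adjacent callout is attached
        }
    }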

public List<List<OffsetPosition>> preparePlaceTaken(List<List<Dataset>> entities) {
List<List<OffsetPosition>> localPositions = new ArrayList<>();
for (List<Dataset> datasets : entities) {
@@ -2690,7 +2715,7 @@ public List<Dataset> propagateLayoutTokenSequence(DatasetDocumentSequence sequen
entity.getSequenceIdentifiers().addAll(name.getSequenceIdentifiers());
//entity.setType(DatastetLexicon.Dataset_Type.DATASET);
entity.setPropagated(true);
entity.setGlobalContextOffset(sentenceOffsetStart);
// entity.setGlobalContextOffset(sentenceOffsetStart);
if (entities == null)
entities = new ArrayList<>();
entities.add(entity);
@@ -4,6 +4,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.data.BibDataSet;
@@ -338,7 +339,7 @@ public static Response processDatasetJATS(final InputStream inputStream,
json.append(", \"md5\": \"" + md5Str + "\"");
json.append(", \"mentions\":[");

if (extractedEntities != null && extractedEntities.size()>0) {
if (CollectionUtils.isNotEmpty(extractedEntities)) {
boolean startList = true;
for(List<Dataset> results : extractedEntities) {
for(Dataset dataset : results) {
Expand All @@ -353,12 +354,12 @@ public static Response processDatasetJATS(final InputStream inputStream,

json.append("], \"references\":[");

// if (extractionResult != null) {
// List<BibDataSet> bibDataSet = extractionResult.getRight();
// if (bibDataSet != null && bibDataSet.size()>0) {
// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
// }
// }
if (CollectionUtils.isNotEmpty(extractedEntities)) {
List<BibDataSet> bibDataSet = extractionResult.getRight();
if (CollectionUtils.isNotEmpty(bibDataSet)) {
DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
}
}

json.append("]");

@@ -442,7 +443,7 @@ public static Response processDatasetTEI(final InputStream inputStream,
String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase();
json.append(", \"md5\": \"" + md5Str + "\"");
json.append(", \"mentions\":[");
if (extractedEntities != null && extractedEntities.size()>0) {
if (CollectionUtils.isNotEmpty(extractedEntities)) {
boolean startList = true;
for(List<Dataset> results : extractedEntities) {
for(Dataset dataset : results) {
Expand All @@ -454,14 +455,15 @@ public static Response processDatasetTEI(final InputStream inputStream,
}
}
}
json.append("], \"references\":[]");

// if (extractionResult != null) {
// List<BibDataSet> bibDataSet = extractionResult.getRight();
// if (bibDataSet != null && bibDataSet.size()>0) {
// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
// }
// }
json.append("], \"references\":[");

if (CollectionUtils.isNotEmpty(extractedEntities)) {
List<BibDataSet> bibDataSet = extractionResult.getRight();
if (CollectionUtils.isNotEmpty(bibDataSet)) {
DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities);
}
}
json.append("]");

float runtime = ((float)(end-start)/1000);
json.append(", \"runtime\": "+ runtime);