From 5e8ecfdb86978eb2cdc7f20c40393fe52f8eafed Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 15:36:25 +0900 Subject: [PATCH 01/46] make gradle build and add github actions --- .github/workflows/ci-build-manual.yml | 42 +++++++++++ .github/workflows/ci-build.yml | 65 ++++++++++++++++ .github/workflows/ci-integration-manual.yml | 32 ++++++++ .github/workflows/ci-release.yml | 74 +++++++++++++++++++ .../core/engines/DataseerClassifierTest.java | 31 +++----- .../core/engines/DatasetParserTest.java | 32 +++----- ...ava => DatasetLexiconIntegrationTest.java} | 72 +++++++----------- 7 files changed, 263 insertions(+), 85 deletions(-) create mode 100644 .github/workflows/ci-build-manual.yml create mode 100644 .github/workflows/ci-build.yml create mode 100644 .github/workflows/ci-integration-manual.yml create mode 100644 .github/workflows/ci-release.yml rename src/test/java/org/grobid/core/lexicon/{DatasetLexiconTest.java => DatasetLexiconIntegrationTest.java} (63%) diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml new file mode 100644 index 0000000..7f596c1 --- /dev/null +++ b/.github/workflows/ci-build-manual.yml @@ -0,0 +1,42 @@ +name: Build and push a development version on docker + +on: + workflow_dispatch: + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - uses: actions/checkout@v2 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.local + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: true + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100644 index 0000000..dd107a3 --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,65 @@ +name: Build unstable + +on: [push] + +concurrency: + group: gradle +# cancel-in-progress: true + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + - name: Test with Gradle Jacoco and Coveralls + run: ./gradlew test jacocoTestReport coveralls --no-daemon + + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + format: jacoco + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + # - name: Maximize build disk space + # uses: easimon/maximize-build-space@v10 + # with: + # root-reserve-mb: 512 + # swap-size-mb: 1024 + # remove-dotnet: 'true' + - name: Create more disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /opt/hostedtoolcache + - uses: actions/checkout@v4 + 
- name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.local + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-integration-manual.yml b/.github/workflows/ci-integration-manual.yml new file mode 100644 index 0000000..4de0417 --- /dev/null +++ b/.github/workflows/ci-integration-manual.yml @@ -0,0 +1,32 @@ +name: Run integration tests manually + +on: + push: + branches: + - master + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout grobid home + uses: actions/checkout@v4 + with: + repository: kermitt2/grobid + path: ./grobid + - name: Checkout Datastet + uses: actions/checkout@v4 + with: + path: ./grobid/datastet + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build and run integration tests + working-directory: ./grobid/datastet + run: ./gradlew copyModels integration --no-daemon + diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml new file mode 100644 index 0000000..19e59d1 --- /dev/null +++ b/.github/workflows/ci-release.yml @@ -0,0 +1,74 @@ +name: Build release + +on: + workflow_dispatch: + push: + tags: + - 'v*' + +concurrency: + group: docker + cancel-in-progress: true + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + - name: Test with Gradle Jacoco and Coveralls + run: ./gradlew test jacocoTestReport coveralls --no-daemon + + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + format: jacoco + + + docker-build: + needs: [build] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Set tags + id: set_tags + run: | + DOCKER_IMAGE=lfoppiano/datastet + VERSION="" + if [[ $GITHUB_REF == refs/tags/v* ]]; then + VERSION=${GITHUB_REF#refs/tags/v} + fi + if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + TAGS="${VERSION}" + else + TAGS="latest" + fi + echo "TAGS=${TAGS}" + echo ::set-output name=tags::${TAGS} + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.local + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.set_tags.outputs.tags }} + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java b/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java index 115b8d6..8940913 100644 --- a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java +++ b/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java @@ -1,49 +1,40 @@ 
package org.grobid.core.engines; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.IOUtils; -import org.grobid.core.document.Document; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; +import org.grobid.core.utilities.DatastetConfiguration; +import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.GrobidProperties; import org.junit.Before; import org.junit.BeforeClass; -import org.junit.Test; import org.junit.Ignore; +import org.junit.Test; import java.io.File; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Arrays; import java.util.ArrayList; - -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertNotNull; +import java.util.Arrays; +import java.util.List; /** * @author Patrice */ @Ignore public class DataseerClassifierTest { - private static DataseerConfiguration configuration; + private static DatastetConfiguration configuration; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); File yamlFile = new File("resources/config/dataseer-ml.yml").getAbsoluteFile(); yamlFile = new File(yamlFile.getAbsolutePath()); - dataseerConfiguration = mapper.readValue(yamlFile, DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(yamlFile, DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); diff --git a/src/test/java/org/grobid/core/engines/DatasetParserTest.java b/src/test/java/org/grobid/core/engines/DatasetParserTest.java index 8864128..7f4ecb7 100644 --- a/src/test/java/org/grobid/core/engines/DatasetParserTest.java +++ b/src/test/java/org/grobid/core/engines/DatasetParserTest.java @@ -1,50 +1,42 @@ package org.grobid.core.engines; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.IOUtils; -import org.grobid.core.document.Document; import org.grobid.core.data.Dataset; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; +import org.grobid.core.utilities.DatastetConfiguration; +import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.GrobidProperties; +import org.grobid.service.configuration.DatastetServiceConfiguration; import org.junit.Before; import org.junit.BeforeClass; -import org.junit.Test; import org.junit.Ignore; +import org.junit.Test; import java.io.File; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Arrays; import java.util.ArrayList; - -import 
org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertNotNull; +import java.util.Arrays; +import java.util.List; /** * @author Patrice */ @Ignore public class DatasetParserTest { - private static DataseerConfiguration configuration; + private static DatastetConfiguration configuration; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); File yamlFile = new File("resources/config/dataseer-ml.yml").getAbsoluteFile(); yamlFile = new File(yamlFile.getAbsolutePath()); - dataseerConfiguration = mapper.readValue(yamlFile, DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(yamlFile, DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); diff --git a/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java b/src/test/java/org/grobid/core/lexicon/DatasetLexiconIntegrationTest.java similarity index 63% rename from src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java rename to src/test/java/org/grobid/core/lexicon/DatasetLexiconIntegrationTest.java index 7e5c736..2f770da 100644 --- a/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java +++ b/src/test/java/org/grobid/core/lexicon/DatasetLexiconIntegrationTest.java @@ -1,64 +1,46 @@ package org.grobid.core.lexicon; -import org.apache.commons.io.IOUtils; -import org.grobid.core.analyzers.DataseerAnalyzer; -import org.grobid.core.data.Dataset; -import org.grobid.core.document.Document; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; -import org.grobid.core.utilities.OffsetPosition; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.grobid.core.main.GrobidHomeFinder; +import org.grobid.core.utilities.DatastetConfiguration; import org.grobid.core.utilities.GrobidConfig.ModelParameters; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.utilities.Pair; -import org.junit.Before; +import org.grobid.core.utilities.GrobidProperties; import org.junit.BeforeClass; import org.junit.Test; import java.io.File; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; -import static org.junit.Assert.assertNotNull; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; /** * @author Patrice */ -public class DatasetLexiconTest { - private static DataseerLexicon dataseerLexicon; +public class DatasetLexiconIntegrationTest { + private static DatastetLexicon target; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); - dataseerConfiguration = mapper.readValue(new 
File("resources/config/dataseer-ml.yml").getAbsoluteFile(), DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(new File("resources/config/config.yml").getAbsoluteFile(), DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); GrobidHomeFinder grobidHomeFinder = new GrobidHomeFinder(Arrays.asList(pGrobidHome)); GrobidProperties.getInstance(grobidHomeFinder); - - System.out.println(">>>>>>>> GROBID_HOME="+GrobidProperties.get_GROBID_HOME_PATH()); + + System.out.println(">>>>>>>> GROBID_HOME=" + GrobidProperties.get_GROBID_HOME_PATH()); if (dataseerConfiguration != null && dataseerConfiguration.getModel() != null) { for (ModelParameters model : dataseerConfiguration.getModels()) GrobidProperties.getInstance().addModel(model); } //LibraryLoader.load(); - - dataseerLexicon = DataseerLexicon.getInstance(); - + target = target.getInstance(); } catch (final Exception exp) { System.err.println("GROBID dataset initialisation failed: " + exp); exp.printStackTrace(); @@ -70,10 +52,10 @@ public void testDatasetDOISuccess() throws Exception { String testStringZenodo = "10.5281/zenodo.5769577"; String testStringDryad = "https://doi.org/10.5061/DRYAD.0SN63/7"; String testStringFigshare = "https://doi.org/10.6084/m9.figshare.10275182"; - - boolean zenodoCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringZenodo); - boolean dryadCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringDryad); - boolean figshareCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringFigshare); + + boolean zenodoCheck = target.getInstance().isDatasetDOI(testStringZenodo); + boolean dryadCheck = target.getInstance().isDatasetDOI(testStringDryad); + boolean figshareCheck = target.getInstance().isDatasetDOI(testStringFigshare); assertThat(zenodoCheck, is(true)); assertThat(dryadCheck, is(true)); @@ -85,10 +67,10 @@ public void testDatasetDOIFail() throws Exception { String testStringFirst = "https://doi.org/10.1038/s41523-019-0142-6"; String testStringSecond = "https://doi.org/10.1371/journal.pone.0263302"; String testStringThird = "https://doi.org/10.1186/s13064-019-0127-z"; - - boolean firstCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringFirst); - boolean secondCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringSecond); - boolean thirdCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringThird); + + boolean firstCheck = target.isDatasetDOI(testStringFirst); + boolean secondCheck = target.isDatasetDOI(testStringSecond); + boolean thirdCheck = target.isDatasetDOI(testStringThird); assertThat(firstCheck, is(false)); assertThat(secondCheck, is(false)); @@ -101,9 +83,9 @@ public void testDatasetUrlSuccess() throws Exception { String testStringGithub = "https://github.com/leonfodoulian/SARS_CoV_2_anosmia"; String testStringOsf = "https://osf.io/5r72u"; - boolean idCheck = DataseerLexicon.getInstance().isDatasetURL(testStringId); - boolean githubCheck = DataseerLexicon.getInstance().isDatasetURL(testStringGithub); - boolean osfCheck = DataseerLexicon.getInstance().isDatasetURL(testStringOsf); + boolean idCheck = target.isDatasetURL(testStringId); + boolean githubCheck = target.isDatasetURL(testStringGithub); + boolean osfCheck = target.isDatasetURL(testStringOsf); assertThat(idCheck, is(true)); assertThat(githubCheck, is(true)); @@ -116,9 +98,9 @@ public void testDatasetUrlFail() throws Exception { String testStringSecond = "https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/LemmatizerModel.html"; String 
testStringThird = "https://stackoverflow.com/questions/11976393/get-github-username-by-id"; - boolean firstCheck = DataseerLexicon.getInstance().isDatasetURL(testStringFirst); - boolean secondCheck = DataseerLexicon.getInstance().isDatasetURL(testStringSecond); - boolean thirdCheck = DataseerLexicon.getInstance().isDatasetURL(testStringThird); + boolean firstCheck = target.isDatasetURL(testStringFirst); + boolean secondCheck = target.isDatasetURL(testStringSecond); + boolean thirdCheck = target.isDatasetURL(testStringThird); assertThat(firstCheck, is(false)); assertThat(secondCheck, is(false)); @@ -130,8 +112,8 @@ public void testLeadingStopwords() throws Exception { String testStringFirst = "and the dataset TOTO"; String testStringSecond = "and the dataset TOTO of"; - String firstCheck = DataseerLexicon.getInstance().removeLeadingEnglishStopwords(testStringFirst); - String secondCheck = DataseerLexicon.getInstance().removeLeadingEnglishStopwords(testStringSecond); + String firstCheck = target.removeLeadingEnglishStopwords(testStringFirst); + String secondCheck = target.removeLeadingEnglishStopwords(testStringSecond); assertThat(firstCheck, is("dataset TOTO")); assertThat(secondCheck, is("dataset TOTO of")); From d545b5d4b493c90ed5a327f8f364644af4961541 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 16:14:24 +0900 Subject: [PATCH 02/46] read grobid-home from configuration --- build.gradle | 92 ++++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/build.gradle b/build.gradle index 15232c4..f38c3f6 100644 --- a/build.gradle +++ b/build.gradle @@ -7,6 +7,7 @@ buildscript { dependencies { classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0' classpath "gradle.plugin.com.github.jengelman.gradle.plugins:shadow:7.0.0" + classpath group: 'org.yaml', name: 'snakeyaml', version: '1.19' } } @@ -30,8 +31,8 @@ version = '0.8.0' description = """datastet""" -sourceCompatibility = 1.11 -targetCompatibility = 1.11 +sourceCompatibility = 1.17 +targetCompatibility = 1.17 import org.apache.tools.ant.taskdefs.condition.Os @@ -67,12 +68,12 @@ dependencies { //Apache commons implementation group: 'commons-pool', name: 'commons-pool', version: '1.6' - implementation group: 'commons-io', name: 'commons-io', version: '2.5' + implementation group: 'commons-io', name: 'commons-io', version: '2.9.0' //implementation group: 'commons-logging', name: 'commons-logging', version: '1.2' - implementation group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.3' + implementation group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.14' implementation group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.3' implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.6' - implementation group: 'org.apache.commons', name: 'commons-collections4', version: '4.1' + implementation group: 'org.apache.commons', name: 'commons-collections4', version: '4.4' implementation group: 'org.apache.commons', name: 'commons-csv', version: '1.5' implementation group: 'com.google.guava', name: 'guava', version: '28.2-jre' @@ -127,11 +128,11 @@ dependencies { testImplementation group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' } -configurations.all { +configurations.all { resolutionStrategy { force 'xml-apis:xml-apis:1.4.01' } - + exclude group: 'org.slf4j', module: "slf4j-log4j12" exclude group: 'org.slf4j', module: "slf4j-jdk14" exclude group: 'log4j', module: 
"log4j" @@ -143,14 +144,14 @@ configurations.all { def libraries = "" if (Os.isFamily(Os.FAMILY_MAC)) { if (Os.OS_ARCH.equals("aarch64")) { - libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}" + libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}" } else { libraries = "${file("../grobid-home/lib/mac-64").absolutePath}" } } else if (Os.isFamily(Os.FAMILY_UNIX)) { libraries = "${file("../grobid-home/lib/lin-64/jep").absolutePath}:" + - "${file("../grobid-home/lib/lin-64").absolutePath}:" -} else { + "${file("../grobid-home/lib/lin-64").absolutePath}:" +} else { throw new RuntimeException("Unsupported platform!") } @@ -161,10 +162,10 @@ test { exclude '**/**IntegrationTest**' if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", - "--add-opens", "java.base/java.io=ALL-UNNAMED" + jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", + "--add-opens", "java.base/java.io=ALL-UNNAMED" } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // return the default value if the property has not been specified in command line @@ -225,11 +226,11 @@ jar { enabled true } -tasks.withType(Tar){ +tasks.withType(Tar) { duplicatesStrategy = DuplicatesStrategy.EXCLUDE } -tasks.withType(Zip){ +tasks.withType(Zip) { duplicatesStrategy = DuplicatesStrategy.EXCLUDE } @@ -238,67 +239,74 @@ artifacts { archives shadowJar } +def conf = new org.yaml.snakeyaml.Yaml().load(new File("resources/config/config.yml").newInputStream()) +def grobidHome = conf.grobidHome.replace("\$", "").replace('{', "").replace("GROBID_HOME:- ", "").replace("}", "") +if (grobidHome.startsWith("../")) { + grobidHome = "${rootProject.rootDir}/${grobidHome}" +} + task installModels(type: Copy) { from "${rootDir}/resources/models/" include "**" - into "${rootDir}/../grobid-home/models/" + into "${grobidHome}/models/" doLast { + print "Copy models under grobid-home: ${grobidHome}" download { src "https://grobid.s3.amazonaws.com/dataseer-binary_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-binary_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") - + ant.unzip(src: "${grobidHome}/models/dataseer-binary_bert-0.3.1.zip", dest: "${grobidHome}/models/") + download { src "https://grobid.s3.amazonaws.com/dataseer-first_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-first_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") - + ant.unzip(src: "${grobidHome}/models/dataseer-first_bert-0.3.1.zip", dest: "${grobidHome}/models/") + download { src "https://grobid.s3.amazonaws.com/dataseer-reuse_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-reuse_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") + ant.unzip(src: "${grobidHome}/models/dataseer-reuse_bert-0.3.1.zip", dest: "${grobidHome}/models/") download { src "https://grobid.s3.amazonaws.com/datasets-BERT_CRF-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: 
"${rootDir}/../grobid-home/models/datasets-BERT_CRF-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") + ant.unzip(src: "${grobidHome}/models/datasets-BERT_CRF-0.3.2.zip", dest: "${grobidHome}/models/") download { src "https://grobid.s3.amazonaws.com/context_bert-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - + ant.unzip(src: "${grobidHome}/models/context_bert-0.3.2.zip", dest: "${grobidHome}/models/") + download { src "https://grobid.s3.amazonaws.com/context_bert_used-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_used-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - + ant.unzip(src: "${grobidHome}/models/context_bert_used-0.3.2.zip", dest: "${grobidHome}/models/") + download { src "https://grobid.s3.amazonaws.com/context_bert_creation-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_creation-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - + ant.unzip(src: "${grobidHome}/models/context_bert_creation-0.3.2.zip", dest: "${grobidHome}/models/") + download { src "https://grobid.s3.amazonaws.com/context_bert_shared-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" + dest "${grobidHome}/models/" overwrite false } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_shared-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") + ant.unzip(src: "${grobidHome}/models/context_bert_shared-0.3.2.zip", dest: "${grobidHome}/models/") } } @@ -323,7 +331,7 @@ task(train_dataseer, dependsOn: 'classes', type: JavaExec, group: 'training') { } else { jvmArgs '-Xmx3072m' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries // jvmArgs '-Xms2g', '-Xmx8g' } @@ -345,7 +353,7 @@ task(eval_dataseer, dependsOn: 'classes', type: JavaExec, group: 'evaluation') { } else { jvmArgs '-Xmx3072m' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries // jvmArgs '-Xms2g', '-Xmx8g' } @@ -360,7 +368,7 @@ task(eval_dataseer_split, dependsOn: 'classes', type: JavaExec, group: 'evaluati } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // Run like this: ./gradlew eval_dataseer_nfold -PgH=/path/grobid/home -Pt=10 @@ -373,7 +381,7 @@ task(eval_dataseer_nfold, dependsOn: 'classes', type: JavaExec, group: 'evaluati } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // Run like this: ./gradlew annotated_corpus_generator_csv -Pfull=/path/input/fulltext -Ppdf=/path/input/pdf -Pcsv=/path/csv -Pxml=/output/directory @@ -386,7 +394,7 @@ task(annotated_corpus_generator_csv, dependsOn: 'classes', type: JavaExec, group } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty 
"java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } ////////// From 33648de4f05f4880c5d2aff2446f7b1db4b2260b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 16:25:59 +0900 Subject: [PATCH 03/46] disable superfluous tests --- .github/workflows/ci-integration-manual.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-integration-manual.yml b/.github/workflows/ci-integration-manual.yml index 4de0417..c507534 100644 --- a/.github/workflows/ci-integration-manual.yml +++ b/.github/workflows/ci-integration-manual.yml @@ -1,9 +1,9 @@ name: Run integration tests manually on: - push: - branches: - - master +# push: +# branches: +# - master workflow_dispatch: jobs: From 49a07b6df1a42c6ee1e026e9c1f986206d6b6080 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 16:34:22 +0900 Subject: [PATCH 04/46] fix build --- build.gradle | 18 +++++++++++++----- resources/config/config.yml | 2 +- ... => DataseerClassifierIntegrationTest.java} | 2 +- ....java => DatasetParserIntegrationTest.java} | 3 +-- 4 files changed, 16 insertions(+), 9 deletions(-) rename src/test/java/org/grobid/core/engines/{DataseerClassifierTest.java => DataseerClassifierIntegrationTest.java} (98%) rename src/test/java/org/grobid/core/engines/{DatasetParserTest.java => DatasetParserIntegrationTest.java} (96%) diff --git a/build.gradle b/build.gradle index f38c3f6..111c8cd 100644 --- a/build.gradle +++ b/build.gradle @@ -14,13 +14,9 @@ buildscript { plugins { id 'com.github.johnrengelman.shadow' version '7.0.0' id "de.undercouch.download" version "4.1.1" + id "jacoco" } -apply plugin: 'jacoco' - -jacoco { - toolVersion = '0.8.8' -} apply plugin: 'java-library' apply plugin: 'base' @@ -414,3 +410,15 @@ application { args = ['server', 'resources/config/config.yml'] } } + +jacocoTestReport { + reports { + xml.enabled = true // coveralls plugin depends on xml format report + html.enabled = true + } + dependsOn test // tests are required to run before generating the report +} + +coveralls { + jacocoReportPath 'build/reports/jacoco/test/jacocoTestReport.xml' +} diff --git a/resources/config/config.yml b/resources/config/config.yml index 917e307..7569db5 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -2,7 +2,7 @@ version: "0.8.0" corpusPath: "./resources/dataset/dataseer/corpus" templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template" -grobidHome: "../grobid-home" +grobidHome: "../../grobid/grobid-home" tmpPath: "tmp/" # path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI diff --git a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java b/src/test/java/org/grobid/core/engines/DataseerClassifierIntegrationTest.java similarity index 98% rename from src/test/java/org/grobid/core/engines/DataseerClassifierTest.java rename to src/test/java/org/grobid/core/engines/DataseerClassifierIntegrationTest.java index 8940913..2b09804 100644 --- a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java +++ b/src/test/java/org/grobid/core/engines/DataseerClassifierIntegrationTest.java @@ -23,7 +23,7 @@ * @author Patrice */ @Ignore -public class DataseerClassifierTest { +public class DataseerClassifierIntegrationTest { private static DatastetConfiguration configuration; @BeforeClass diff --git a/src/test/java/org/grobid/core/engines/DatasetParserTest.java 
b/src/test/java/org/grobid/core/engines/DatasetParserIntegrationTest.java similarity index 96% rename from src/test/java/org/grobid/core/engines/DatasetParserTest.java rename to src/test/java/org/grobid/core/engines/DatasetParserIntegrationTest.java index 7f4ecb7..5aeecd9 100644 --- a/src/test/java/org/grobid/core/engines/DatasetParserTest.java +++ b/src/test/java/org/grobid/core/engines/DatasetParserIntegrationTest.java @@ -9,7 +9,6 @@ import org.grobid.core.utilities.DatastetConfiguration; import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.utilities.GrobidProperties; -import org.grobid.service.configuration.DatastetServiceConfiguration; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Ignore; @@ -25,7 +24,7 @@ * @author Patrice */ @Ignore -public class DatasetParserTest { +public class DatasetParserIntegrationTest { private static DatastetConfiguration configuration; @BeforeClass From 5d2872e87149a2da00d165a65c86cc05b16dd574 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 16:55:08 +0900 Subject: [PATCH 05/46] add simple test on analyzer to get started --- build.gradle | 2 + .../grobid/core/engines/DataseerParser.java | 47 ++---------- .../core/analyzers/DatastetAnalyzerTest.java | 76 +++++++++++++++++++ 3 files changed, 86 insertions(+), 39 deletions(-) create mode 100644 src/test/java/org/grobid/core/analyzers/DatastetAnalyzerTest.java diff --git a/build.gradle b/build.gradle index 111c8cd..c2556de 100644 --- a/build.gradle +++ b/build.gradle @@ -122,6 +122,8 @@ dependencies { //Tests testImplementation group: 'junit', name: 'junit', version: '4.12' testImplementation group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' + testImplementation 'org.powermock:powermock-module-junit4:2.0.9' + testImplementation 'org.powermock:powermock-api-easymock:2.0.9' } configurations.all { diff --git a/src/main/java/org/grobid/core/engines/DataseerParser.java b/src/main/java/org/grobid/core/engines/DataseerParser.java index 7e15eca..fe35c05 100644 --- a/src/main/java/org/grobid/core/engines/DataseerParser.java +++ b/src/main/java/org/grobid/core/engines/DataseerParser.java @@ -1,53 +1,22 @@ package org.grobid.core.engines; -import org.apache.commons.io.FileUtils; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.DatastetAnalyzer; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.data.BibDataSet; -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.TEIFormatter; -import org.grobid.core.document.xml.XmlBuilderUtils; -import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.label.SegmentationLabels; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.engines.tagging.GrobidCRFEngine; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorDataseer; import org.grobid.core.features.FeatureFactory; -import org.grobid.core.layout.BoundingBox; +import org.grobid.core.features.FeaturesVectorDataseer; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.lexicon.DatastetLexicon; -import org.grobid.core.tokenization.TaggingTokenCluster; -import org.grobid.core.tokenization.TaggingTokenClusteror; -import 
org.grobid.core.utilities.*; -import org.grobid.core.utilities.counters.CntManager; +import org.grobid.core.utilities.DatastetUtilities; +import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.counters.impl.CntManagerFactory; -import org.grobid.core.lexicon.FastMatcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.InputSource; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; - -import nu.xom.Attribute; -import nu.xom.Element; -import nu.xom.Node; -import nu.xom.Text; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; -import org.apache.commons.lang3.tuple.Pair; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.commons.lang3.StringUtils.trim; /** * Identification of the article sections introducing datasets. diff --git a/src/test/java/org/grobid/core/analyzers/DatastetAnalyzerTest.java b/src/test/java/org/grobid/core/analyzers/DatastetAnalyzerTest.java new file mode 100644 index 0000000..c3122a6 --- /dev/null +++ b/src/test/java/org/grobid/core/analyzers/DatastetAnalyzerTest.java @@ -0,0 +1,76 @@ +package org.grobid.core.analyzers; + +import junit.framework.TestCase; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.lessThan; +import static org.junit.Assert.assertThat; + +public class DatastetAnalyzerTest extends TestCase { + + private DatastetAnalyzer target; + @Before + public void setUp() throws Exception { + target = DatastetAnalyzer.getInstance(); + } + + @Test + public void testTokenize_plainText() throws Exception { + List tokens = target.tokenize("This is a sample text, with 1.5m of intelligence."); + + assertThat(tokens, hasSize(22)); + assertThat(tokens.get(0), is("This")); + assertThat(tokens.get(21), is(".")); + } + + @Test + public void testTokenizeWithLayoutToken_testOffsetConsistency() throws Exception { + List tokens = target.tokenizeWithLayoutToken("This is a sample text, with 1.5m of intelligence."); + List reTokens = target.retokenizeLayoutTokens(tokens); + + assertThat(tokens, hasSize(reTokens.size())); + + for (int i = 0; i < reTokens.size(); i++) { + assertThat(tokens.get(i).getOffset(), is(reTokens.get(i).getOffset())); + } + } + + @Test + public void testTokenizeWithLayoutToken_checkOffsets() throws Exception { + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken("This is a sample text, with 1.5m of intelligence."); + List reTokens = target.retokenizeLayoutTokens(tokens); + + assertThat(tokens.size(), lessThan(reTokens.size())); + } + + @Test + public void testTokenize_3() throws Exception { + String inputText = "This is a sample text, with 1.5m of intelligence."; + List tokens = target.tokenizeWithLayoutToken(inputText); + + assertThat(tokens, hasSize(22)); + assertThat(LayoutTokensUtil.toText(tokens), is(inputText)); + assertThat(tokens.get(0).getText(), is("This")); + assertThat(tokens.get(0).getOffset(), is(0)); + assertThat(tokens.get(15).getText(), is("5")); + assertThat(tokens.get(15).getOffset(), is(30)); + } + + + @Test + public void 
testTokenize_with_unicode_characters() throws Exception { + String input = "La2\u2212xSrxCuO4 (LSCO)"; + + List tokenize = target.tokenize(input); + + assertThat(tokenize, hasSize(9)); + } + +} \ No newline at end of file From 8bc2987d55576172e986556e834131bb462155a9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 17:04:15 +0900 Subject: [PATCH 06/46] enable jacoco report --- gradle.properties | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 gradle.properties diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..dd48123 --- /dev/null +++ b/gradle.properties @@ -0,0 +1,3 @@ +org.gradle.caching=false +org.gradle.daemon=false +org.gradle.jvmargs= --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-exports=jdk.unsupported/sun.misc=ALL-UNNAMED \ No newline at end of file From fd84d8813421bbd884971f41e8845acaa3b6709b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 17:12:57 +0900 Subject: [PATCH 07/46] fix build docker --- .github/workflows/ci-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index dd107a3..6463cc9 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -54,7 +54,7 @@ jobs: id: docker_build uses: mr-smithers-excellent/docker-build-push@v6 with: - dockerfile: Dockerfile.local + dockerfile: Dockerfile.datastet username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} image: lfoppiano/datastet From ffb5bead9fc4e295382109e097576bc8f37d3777 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Mar 2024 17:19:50 +0900 Subject: [PATCH 08/46] disable docker build for the moment --- .github/workflows/ci-build.yml | 64 +++++++++++++++++----------------- Dockerfile.datastet | 2 +- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 6463cc9..0592034 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -31,35 +31,35 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} format: jacoco - docker-build: - needs: [ build ] - runs-on: ubuntu-latest - - steps: - # - name: Maximize build disk space - # uses: easimon/maximize-build-space@v10 - # with: - # root-reserve-mb: 512 - # swap-size-mb: 1024 - # remove-dotnet: 'true' - - name: Create more disk space - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf /opt/hostedtoolcache - - uses: actions/checkout@v4 - - name: Build and push - id: docker_build - uses: mr-smithers-excellent/docker-build-push@v6 - with: - dockerfile: Dockerfile.datastet - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - image: lfoppiano/datastet - registry: docker.io - pushImage: ${{ github.event_name != 'pull_request' }} - tags: latest-develop - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} +# docker-build: +# needs: [ build ] +# runs-on: ubuntu-latest +# +# steps: +# # - name: Maximize build disk space +# # uses: easimon/maximize-build-space@v10 +# # with: +# # root-reserve-mb: 512 +# # swap-size-mb: 1024 +# # remove-dotnet: 'true' +# - name: Create more disk space +# run: | +# sudo rm -rf /usr/share/dotnet +# sudo rm -rf /opt/ghc +# 
sudo rm -rf "/usr/local/share/boost" +# sudo rm -rf "$AGENT_TOOLSDIRECTORY" +# sudo rm -rf /opt/hostedtoolcache +# - uses: actions/checkout@v4 +# - name: Build and push +# id: docker_build +# uses: mr-smithers-excellent/docker-build-push@v6 +# with: +# dockerfile: Dockerfile.datastet +# username: ${{ secrets.DOCKERHUB_USERNAME }} +# password: ${{ secrets.DOCKERHUB_TOKEN }} +# image: lfoppiano/datastet +# registry: docker.io +# pushImage: ${{ github.event_name != 'pull_request' }} +# tags: latest-develop +# - name: Image digest +# run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/Dockerfile.datastet b/Dockerfile.datastet index 7ec2ada..ed17bbf 100644 --- a/Dockerfile.datastet +++ b/Dockerfile.datastet @@ -25,7 +25,7 @@ COPY grobid-home/ ./grobid-home/ COPY grobid-core/ ./grobid-core/ #COPY grobid-service/ ./grobid-service/ COPY grobid-trainer/ ./grobid-trainer/ -COPY datastet/ ./datastet/ +#COPY datastet/ ./datastet/ # cleaning unused native libraries before packaging RUN rm -rf grobid-home/pdf2xml From bb48f37d5cabf24735285704924363e9b21dc670 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 18 Apr 2024 09:02:12 +0800 Subject: [PATCH 09/46] add parameter to enable/disable sentence segmentation for TEI processing --- .../service/controller/DatastetController.java | 8 +++++--- .../service/controller/DatastetProcessFile.java | 13 +++++++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/grobid/service/controller/DatastetController.java b/src/main/java/org/grobid/service/controller/DatastetController.java index 587c15d..850539e 100644 --- a/src/main/java/org/grobid/service/controller/DatastetController.java +++ b/src/main/java/org/grobid/service/controller/DatastetController.java @@ -80,7 +80,7 @@ public Response processDatasetText_post(@FormParam(TEXT) String text) { LOGGER.info(text); return DatastetProcessString.processDatsetSentence(text); } - + @Path(PATH_DATASET_SENTENCE) @Produces(MediaType.APPLICATION_JSON + ";charset=utf-8") @GET @@ -111,8 +111,10 @@ public Response processDatasetPDF(@FormDataParam(INPUT) InputStream inputStream, @Consumes(MediaType.MULTIPART_FORM_DATA) @Produces(MediaType.APPLICATION_XML) @POST - public Response processTEI(@FormDataParam(INPUT) InputStream inputStream) { - return DatastetProcessFile.processTEI(inputStream); + public Response processTEI( + @FormDataParam(INPUT) InputStream inputStream, + @FormDataParam("segmentSentences") String segmentSentences) { + return DatastetProcessFile.processTEI(inputStream, segmentSentences); } @Path(PATH_DATASEER_JATS) diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index dd4540d..3cc83d8 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -69,8 +69,9 @@ public DatastetProcessFile() { * @param inputStream the data of origin TEI document * @return a response object which contains an enriched TEI representation of the document */ - public static Response processTEI(final InputStream inputStream) { + public static Response processTEI(final InputStream inputStream, String segmentSentences) { LOGGER.debug(methodLogIn()); + boolean segmentSentencesBool = validateTrueFalseParam(segmentSentences); String retVal = null; Response response = null; File originFile = null; @@ -84,7 +85,7 @@ public static Response processTEI(final InputStream inputStream) { } // starts 
conversion process - retVal = classifier.processTEI(originFile.getAbsolutePath(), true, false); + retVal = classifier.processTEI(originFile.getAbsolutePath(), segmentSentencesBool, false); if (!isResultOK(retVal)) { response = Response.status(Response.Status.NO_CONTENT).build(); @@ -516,6 +517,14 @@ public static String methodLogOut() { return "<< " + DatastetProcessFile.class.getName() + "." + Thread.currentThread().getStackTrace()[1].getMethodName(); } + private static boolean validateTrueFalseParam(String param) { + boolean booleanOutput = false; + if ((param != null) && (param.equals("1") || param.equalsIgnoreCase("true"))) { + booleanOutput = true; + } + return booleanOutput; + } + /** * Check whether the result is null or empty. */ From f05f68bfb99686ab46bb6f2fcce0fc834a82e46a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 26 Apr 2024 13:04:22 +0900 Subject: [PATCH 10/46] Update docker build (#1) Update docker build and add github actions for CI --- .github/workflows/ci-build.yml | 63 ++++--- Dockerfile.datastet | 178 ++++++------------ build.gradle | 139 +++++--------- resources/config/config-docker.yml | 170 +++++++++++++++++ resources/config/config.yml | 20 +- .../grobid/core/engines/DatasetParser.java | 2 +- .../controller/DatastetController.java | 18 +- .../controller/DatastetProcessFile.java | 13 +- 8 files changed, 345 insertions(+), 258 deletions(-) create mode 100644 resources/config/config-docker.yml diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 0592034..18f9a31 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -31,35 +31,36 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} format: jacoco -# docker-build: -# needs: [ build ] -# runs-on: ubuntu-latest -# -# steps: -# # - name: Maximize build disk space -# # uses: easimon/maximize-build-space@v10 -# # with: -# # root-reserve-mb: 512 -# # swap-size-mb: 1024 -# # remove-dotnet: 'true' -# - name: Create more disk space -# run: | -# sudo rm -rf /usr/share/dotnet -# sudo rm -rf /opt/ghc -# sudo rm -rf "/usr/local/share/boost" -# sudo rm -rf "$AGENT_TOOLSDIRECTORY" -# sudo rm -rf /opt/hostedtoolcache -# - uses: actions/checkout@v4 -# - name: Build and push -# id: docker_build -# uses: mr-smithers-excellent/docker-build-push@v6 + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: +# - name: Maximize build disk space +# uses: easimon/maximize-build-space@v10 # with: -# dockerfile: Dockerfile.datastet -# username: ${{ secrets.DOCKERHUB_USERNAME }} -# password: ${{ secrets.DOCKERHUB_TOKEN }} -# image: lfoppiano/datastet -# registry: docker.io -# pushImage: ${{ github.event_name != 'pull_request' }} -# tags: latest-develop -# - name: Image digest -# run: echo ${{ steps.docker_build.outputs.digest }} +# remove-dotnet: 'true' +# remove-haskell: 'true' +# remove-codeql: 'true' +# remove-android: 'true' + - name: Create more disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /opt/hostedtoolcache + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.datastet + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: latest-develop + - name: Image digest + run: echo ${{ 
steps.docker_build.outputs.digest }} diff --git a/Dockerfile.datastet b/Dockerfile.datastet index ed17bbf..6814850 100644 --- a/Dockerfile.datastet +++ b/Dockerfile.datastet @@ -6,138 +6,72 @@ FROM openjdk:17-jdk-slim as builder USER root -ARG GROBID_VERSION - RUN apt-get update && \ - apt-get -y --no-install-recommends install unzip - -WORKDIR /opt/grobid-source - -# gradle -COPY gradle/ ./gradle/ -COPY gradlew ./ -COPY gradle.properties ./ -COPY build.gradle ./ -COPY settings.gradle ./ - -# source -COPY grobid-home/ ./grobid-home/ -COPY grobid-core/ ./grobid-core/ -#COPY grobid-service/ ./grobid-service/ -COPY grobid-trainer/ ./grobid-trainer/ -#COPY datastet/ ./datastet/ - -# cleaning unused native libraries before packaging -RUN rm -rf grobid-home/pdf2xml -RUN rm -rf grobid-home/pdfalto/lin-32 -RUN rm -rf grobid-home/pdfalto/mac-64 -RUN rm -rf grobid-home/pdfalto/win-* -RUN rm -rf grobid-home/lib/lin-32 -RUN rm -rf grobid-home/lib/win-* -RUN rm -rf grobid-home/lib/mac-64 - -RUN ./gradlew clean assemble install --no-daemon --info --stacktrace - -WORKDIR ./datastet/ -RUN ./gradlew clean install --no-daemon --info --stacktrace + apt-get -y --no-install-recommends install apt-utils libxml2 git unzip wget + +WORKDIR /opt/grobid +RUN mkdir -p datastet-source grobid-home/models +COPY src datastet-source/src +COPY settings.gradle datastet-source/ +COPY resources/config/config-docker.yml datastet-source/resources/config/config.yml +COPY resources/models datastet-source/resources/models +COPY resources/lexicon datastet-source/resources/lexicon +COPY build.gradle datastet-source/ +COPY gradle.properties datastet-source/ +COPY gradle datastet-source/gradle/ +COPY gradlew datastet-source/ +#COPY .git datastet-source/.git +#COPY localLibs datastet-source/localLibs + +# Preparing models +WORKDIR /opt/grobid/datastet-source +RUN rm -rf /opt/grobid/grobid-home/models/* +RUN ./gradlew clean assemble -x shadowJar --no-daemon --stacktrace --info +RUN ./gradlew installModels --no-daemon --info --stacktrace \ + && rm -f /opt/grobid/grobid-home/models/*.zip + +# Preparing distribution WORKDIR /opt/grobid -#RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \ -# mv grobid-service* grobid-service -RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \ - chmod -R 755 /opt/grobid/grobid-home/pdfalto -RUN rm -rf grobid-source +RUN unzip -o /opt/grobid/datastet-source/build/distributions/datastet-*.zip -d datastet_distribution \ + && mv datastet_distribution/datastet-* datastet \ + && rm -rf /opt/grobid/datastet-source/build + +# install Pub2TEI +WORKDIR /opt/ +RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip && \ + unzip master.zip && \ + mv Pub2TEI-master Pub2TEI && \ + rm master.zip + # ------------------- # build runtime image # ------------------- -# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine -FROM tensorflow/tensorflow:2.7.0-gpu -CMD nvidia-smi +FROM lfoppiano/grobid:0.8.0-full-slim as runtime # setting locale is likely useless but to be sure ENV LANG C.UTF-8 -# update NVIDIA Cuda key (following a key rotation in April 2022) -RUN apt-get install -y wget -RUN apt-key del 7fa2af80 -RUN rm /etc/apt/sources.list.d/cuda.list -RUN rm /etc/apt/sources.list.d/nvidia-ml.list -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb -RUN dpkg -i cuda-keyring_1.0-1_all.deb - -# install JRE, python and other 
dependencies -RUN apt-get update && \ - apt-get -y --no-install-recommends install apt-utils build-essential gcc libxml2 libfontconfig unzip curl \ - openjdk-17-jre-headless openjdk-17-jdk ca-certificates-java \ - musl gfortran \ - python3 python3-pip python3-setuptools python3-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /opt/grobid - -COPY --from=builder /opt/grobid . - -RUN python3 -m pip install pip --upgrade - -# install DeLFT via pypi -RUN pip3 install requests delft==0.3.3 -# link the data directory to /data -# the current working directory will most likely be /opt/grobid -RUN mkdir -p /data \ - && ln -s /data /opt/grobid/data \ - && ln -s /data ./data - -# disable python warnings (and fix logging) -ENV PYTHONWARNINGS="ignore" - WORKDIR /opt/grobid +RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \ + && rm -rf /opt/grobid/grobid-service \ + && ln -sf datastet/resources/ resources -ENV JAVA_OPTS=-Xmx4g - -# install jep (and temporarily the matching JDK) -ENV JDK_URL=https://download.java.net/java/GA/jdk17.0.2/dfd4a8d0985749f896bed50d7138ee7f/8/GPL/openjdk-17.0.2_linux-x64_bin.tar.gz -RUN curl --fail --show-error --location -q ${JDK_URL} -o /tmp/openjdk.tar.gz -RUN mkdir /tmp/jdk-17 -RUN tar xvfz /tmp/openjdk.tar.gz --directory /tmp/jdk-17 --strip-components 1 --no-same-owner -RUN /tmp/jdk-17/bin/javac -version -RUN JAVA_HOME=/tmp/jdk-17 pip3 install jep==4.0.2 -RUN rm -f /tmp/openjdk.tar.gz -RUN rm -rf /tmp/jdk-17 -ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH} -# remove libjep.so because we are providing our own version in the virtual env above -RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so - -# preload embeddings, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded -# download GROBID fine-tuned models based on SciBERT if selected - -COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py . 
-# embeddings will be loaded when building and running tests - -RUN ln -s /opt/grobid /opt/delft +# the last command above is just a hack to make the lexicon loader working -COPY --from=builder /opt/grobid-source/datastet /opt/grobid/datastet -COPY --from=builder /root/.m2/repository/org /opt/grobid/datastet/lib/org +COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models +COPY --from=builder /opt/grobid/datastet ./datastet/ +COPY --from=builder /opt/grobid/datastet-source/resources/config/config.yml ./datastet/resources/config/ +COPY --from=builder /opt/grobid/datastet-source/resources/lexicon/ ./datastet/resources/lexicon/ -# install Pub2TEI -WORKDIR /opt/ -RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip -RUN unzip master.zip -RUN mv Pub2TEI-master Pub2TEI - -WORKDIR /opt/grobid/datastet - -RUN mkdir /opt/grobid/delft -RUN mkdir /opt/grobid/delft/delft -COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json /opt/grobid/delft/delft/resources-registry.json +COPY --from=builder /opt/grobid/datastet /opt/grobid/datastet +COPY --from=builder /opt/Pub2TEI /opt/Pub2TEI -WORKDIR /opt/grobid/datastet +VOLUME ["/opt/grobid/grobid-home/tmp"] -# trigger gradle wrapper install -RUN ./gradlew --version -RUN ./gradlew installModels && rm -rf resources/models && rm ../grobid-home/models/dataseer*.zip && rm ../grobid-home/models/context_*.zip +#WORKDIR /opt/grobid # install ELMo #RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json @@ -147,16 +81,20 @@ RUN ./gradlew installModels && rm -rf resources/models && rm ../grobid-home/mode #RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 /opt/elmo/ # this will build and load embeddings on the image forever (only if required by the config) :) -WORKDIR /opt/grobid/datastet -#RUN ./gradlew clean build test -RUN ./gradlew clean assemble --no-daemon --stacktrace --info -x test +# LF: AFAIK this is not needed at the moment as all the models are running with bert, but might +# be a solution if we want to support the GRU version +# RUN python3 preload_embeddings.py --registry ./resources-registry.json --embedding word2vec + +ARG GROBID_VERSION +ENV GROBID_VERSION=${GROBID_VERSION:-latest} +ENV DATASTET_OPTS "-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED" + -#CMD ["./gradlew", "run"] -CMD ["sh", "-c", "java --add-opens java.base/java.lang=ALL-UNNAMED -jar build/libs/datastet-0.8.0-onejar.jar server resources/config/config.yml"] +CMD ["./datastet/bin/datastet", "server", "datastet/resources/config/config.yml"] LABEL \ authors="The contributors" \ org.label-schema.name="datastet" \ org.label-schema.description="Image with DataStet service" \ - org.label-schema.url="https://github.com/kermitt2/datastet" \ + org.label-schema.url="https://github.com/DataSeer/datastet" \ org.label-schema.version=${GROBID_VERSION} \ No newline at end of file diff --git a/build.gradle b/build.gradle index c2556de..415ac45 100644 --- a/build.gradle +++ b/build.gradle @@ -13,8 +13,10 @@ buildscript { plugins { id 'com.github.johnrengelman.shadow' version '7.0.0' - id "de.undercouch.download" version "4.1.1" + id "de.undercouch.download" version "5.6.0" id "jacoco" + id 'distribution' + id 'application' } @@ -189,51 +191,41 @@ publishing { task install(dependsOn: publishToMavenLocal) -task mainJar(type: ShadowJar) { - zip64 true 
- from sourceSets.main.output - - configurations = [project.configurations.runtimeClasspath] - - from { - project.configurations.implementation.collect { - it.isDirectory() ? [] : localLibs.contains(it.getName()) ? zipTree(it) : [] - } - } -} +//task mainJar(type: ShadowJar) { +// zip64 true +// from sourceSets.main.output +// +// configurations = [project.configurations.runtimeClasspath] +// +// from { +// project.configurations.implementation.collect { +// it.isDirectory() ? [] : localLibs.contains(it.getName()) ? zipTree(it) : [] +// } +// } +//} shadowJar { - classifier = 'onejar' + archiveClassifier = 'onejar' mergeServiceFiles() zip64 true manifest { attributes 'Main-Class': 'org.grobid.core.main.batch.DatastetMain' } - + from sourceSets.main.output configurations = [project.configurations.runtimeClasspath] - - from { - project.configurations.implementation.collect { - it.isDirectory() ? [] : localLibs.contains(it.getName()) ? zipTree(it) : [] - } - } } jar { - dependsOn mainJar +// dependsOn mainJar enabled true } -tasks.withType(Tar) { - duplicatesStrategy = DuplicatesStrategy.EXCLUDE -} - -tasks.withType(Zip) { - duplicatesStrategy = DuplicatesStrategy.EXCLUDE -} +distZip.enabled = true +distTar.enabled = false +shadowDistZip.enabled = false +shadowDistTar.enabled = false artifacts { - archives jar archives shadowJar } @@ -243,69 +235,34 @@ if (grobidHome.startsWith("../")) { grobidHome = "${rootProject.rootDir}/${grobidHome}" } -task installModels(type: Copy) { - from "${rootDir}/resources/models/" - include "**" - into "${grobidHome}/models/" - - doLast { - print "Copy models under grobid-home: ${grobidHome}" - download { - src "https://grobid.s3.amazonaws.com/dataseer-binary_bert-0.3.1.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/dataseer-binary_bert-0.3.1.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/dataseer-first_bert-0.3.1.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/dataseer-first_bert-0.3.1.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/dataseer-reuse_bert-0.3.1.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/dataseer-reuse_bert-0.3.1.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/datasets-BERT_CRF-0.3.2.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/datasets-BERT_CRF-0.3.2.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert-0.3.2.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/context_bert-0.3.2.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert_used-0.3.2.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/context_bert_used-0.3.2.zip", dest: "${grobidHome}/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert_creation-0.3.2.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/context_bert_creation-0.3.2.zip", dest: "${grobidHome}/models/") +def models = [ + 'https://grobid.s3.amazonaws.com/dataseer-binary_bert-0.3.1.zip', + 'https://grobid.s3.amazonaws.com/dataseer-first_bert-0.3.1.zip', + 'https://grobid.s3.amazonaws.com/dataseer-reuse_bert-0.3.1.zip', + 
'https://grobid.s3.amazonaws.com/datasets-BERT_CRF-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_used-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_creation-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_shared-0.3.2.zip' +] + +def installModels = tasks.register("installModels") + +models.eachWithIndex { model, index -> + def downloadTask = tasks.register("downloadModel_$index", Download) { + src(model) + dest "${grobidHome}/models/" + onlyIfNewer true + // overwrite true + } - download { - src "https://grobid.s3.amazonaws.com/context_bert_shared-0.3.2.zip" - dest "${grobidHome}/models/" - overwrite false - } - ant.unzip(src: "${grobidHome}/models/context_bert_shared-0.3.2.zip", dest: "${grobidHome}/models/") + def unzipTask = tasks.register("unzipModel_$index", Copy) { + dependsOn downloadTask + from zipTree(downloadTask.get().outputs.files.first()) + into "${grobidHome}/models/" } + + installModels.get().dependsOn(unzipTask) } //tasks.withType(JavaCompile) { diff --git a/resources/config/config-docker.yml b/resources/config/config-docker.yml new file mode 100644 index 0000000..6909ba8 --- /dev/null +++ b/resources/config/config-docker.yml @@ -0,0 +1,170 @@ +version: "0.8.0" + +corpusPath: "./resources/dataset/dataseer/corpus" +templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template" +grobidHome: "/opt/grobid/grobid-home" +tmpPath: "/opt/grobid/grobid-home/tmp/" + +# path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI +pub2teiPath: "/opt/Pub2TEI/" + +gluttonHost: "https://cloud.science-miner.com/glutton" +gluttonPort: + +# entity-fishing server information for performing entity disambiguation +# for https, indicate 443 as port +entityFishingHost: cloud.science-miner.com/nerd +entityFishingPort: 443 +#entityFishingHost: localhost +#entityFishingPort: 8090 + +# if true we use binary classifiers for the contexts, otherwise use a single multi-label classifier +# binary classifiers perform better, but havier to use +useBinaryContextClassifiers: false + +# sequence labeling model (identify data-related sections) +models: + + # model for zones + - name: "dataseer" + engine: "wapiti" + #engine: "delft" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.00001 + window: 20 + nbMaxIterations: 2000 + + # classifier model, dataset binary (datset or not dataset in the current sentence) + - name: "dataseer-binary" + engine: "delft" + delft: + # deep learning parameters + #architecture: "gru" + architecture: "bert" + #embeddings_name: "word2vec" + transformer: "allenai/scibert_scivocab_cased" + + # identification of the data type (first level hierarchy) + - name: "dataseer-first" + engine: "delft" + delft: + # deep learning parameters + #architecture: "gru" + architecture: "bert" + #embeddings_name: "word2vec" + transformer: "allenai/scibert_scivocab_cased" + + # mention context classification (reuse binary for the moment) + - name: "dataseer-reuse" + engine: "delft" + delft: + # deep learning parameters + #architecture: "gru" + architecture: "bert" + #embeddings_name: "word2vec" + transformer: "allenai/scibert_scivocab_cased" + + # model for dataset mention recognition + - name: "datasets" + #engine: "wapiti" + engine: "delft" + wapiti: + # wapiti training parameters, they will be used at training time only + epsilon: 0.00001 + window: 20 + nbMaxIterations: 2000 + delft: + # deep learning parameters + 
#architecture: "BidLSTM_CRF" + architecture: "BERT_CRF" + #transformer: "allenai/scibert_scivocab_cased" + transformer: "michiyasunaga/LinkBERT-basecased" + #useELMo: true + #embeddings_name: "glove-840B" + runtime: + # parameters used at runtime/prediction + max_sequence_length: 200 + #max_sequence_length: 300 + batch_size: 20 + + - name: "context" + engine: "delft" + delft: + #architecture: "gru" + #embeddings_name: "glove-840B" + architecture: "bert" + transformer: "michiyasunaga/LinkBERT-basecased" + + - name: "context_used" + engine: "delft" + delft: + #architecture: "gru" + #embeddings_name: "glove-840B" + architecture: "bert" + transformer: "michiyasunaga/LinkBERT-basecased" + + - name: "context_creation" + engine: "delft" + delft: + #architecture: "gru" + #embeddings_name: "glove-840B" + architecture: "bert" + transformer: "michiyasunaga/LinkBERT-basecased" + + - name: "context_shared" + engine: "delft" + delft: + #architecture: "gru" + #embeddings_name: "glove-840B" + architecture: "bert" + transformer: "michiyasunaga/LinkBERT-basecased" + +# Limit the maximum number of requests (0, no limit) +maxParallelRequests: 0 + +# CORS configuration for the web API service +corsAllowedOrigins: "*" +corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD" +corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin" + +server: + type: custom + idleTimeout: 120 seconds + applicationConnectors: + - type: http + port: 8065 + adminConnectors: + - type: http + port: 8066 + registerDefaultExceptionMappers: false + maxThreads: 2048 + maxQueuedRequests: 2048 + acceptQueueSize: 2048 + requestLog: + appenders: [] + +# these logging settings apply to the service usage mode +logging: + level: INFO + loggers: + org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF" + org.glassfish.jersey.internal: "OFF" + com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" + appenders: + - type: console + threshold: INFO + timeZone: UTC + # uncomment to have the logs in json format + #layout: + # type: json +# - type: file +# currentLogFilename: logs/datastet-service.log +# threshold: INFO +# archive: true +# archivedLogFilenamePattern: logs/datastet-service-%d.log +# archivedFileCount: 7 +# timeZone: UTC + # uncomment to have the logs in json format + #layout: + # type: json diff --git a/resources/config/config.yml b/resources/config/config.yml index 7569db5..03150b7 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -133,10 +133,10 @@ server: idleTimeout: 120 seconds applicationConnectors: - type: http - port: 8060 + port: 8065 adminConnectors: - type: http - port: 8061 + port: 8066 registerDefaultExceptionMappers: false maxThreads: 2048 maxQueuedRequests: 2048 @@ -153,18 +153,18 @@ logging: com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" appenders: - type: console - threshold: WARN + threshold: INFO timeZone: UTC # uncomment to have the logs in json format #layout: # type: json - - type: file - currentLogFilename: logs/datastet-service.log - threshold: INFO - archive: true - archivedLogFilenamePattern: logs/datastet-service-%d.log - archivedFileCount: 7 - timeZone: UTC +# - type: file +# currentLogFilename: logs/datastet-service.log +# threshold: INFO +# archive: true +# archivedLogFilenamePattern: logs/datastet-service-%d.log +# archivedFileCount: 7 +# timeZone: UTC # uncomment to have the logs in json format #layout: # type: json diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 7c7bb0b..58450b0 
100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1311,7 +1311,7 @@ public String processXML(File file) throws Exception { } /** - * Extract all software mentions from a publisher XML file + * Extract all software mentions from a TEI XML file */ public Pair>, List> processTEIDocument(org.w3c.dom.Document doc, boolean disambiguate, diff --git a/src/main/java/org/grobid/service/controller/DatastetController.java b/src/main/java/org/grobid/service/controller/DatastetController.java index 587c15d..07cc2bc 100644 --- a/src/main/java/org/grobid/service/controller/DatastetController.java +++ b/src/main/java/org/grobid/service/controller/DatastetController.java @@ -80,7 +80,7 @@ public Response processDatasetText_post(@FormParam(TEXT) String text) { LOGGER.info(text); return DatastetProcessString.processDatsetSentence(text); } - + @Path(PATH_DATASET_SENTENCE) @Produces(MediaType.APPLICATION_JSON + ";charset=utf-8") @GET @@ -111,8 +111,20 @@ public Response processDatasetPDF(@FormDataParam(INPUT) InputStream inputStream, @Consumes(MediaType.MULTIPART_FORM_DATA) @Produces(MediaType.APPLICATION_XML) @POST - public Response processTEI(@FormDataParam(INPUT) InputStream inputStream) { - return DatastetProcessFile.processTEI(inputStream); + public Response processTEI( + @FormDataParam(INPUT) InputStream inputStream, + @FormDataParam("segmentSentences") String segmentSentences) { + return DatastetProcessFile.processTEI(inputStream, segmentSentences); + } + + @Path(PATH_DATASEER_TEI) + @Consumes(MediaType.MULTIPART_FORM_DATA) + @Produces(MediaType.APPLICATION_JSON) + @POST + public Response processTEIJsonOutput( + @FormDataParam(INPUT) InputStream inputStream, + @FormDataParam("segmentSentences") String segmentSentences) { + return DatastetProcessFile.extractTEI(inputStream, false, true); } @Path(PATH_DATASEER_JATS) diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index dd4540d..3cc83d8 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -69,8 +69,9 @@ public DatastetProcessFile() { * @param inputStream the data of origin TEI document * @return a response object which contains an enriched TEI representation of the document */ - public static Response processTEI(final InputStream inputStream) { + public static Response processTEI(final InputStream inputStream, String segmentSentences) { LOGGER.debug(methodLogIn()); + boolean segmentSentencesBool = validateTrueFalseParam(segmentSentences); String retVal = null; Response response = null; File originFile = null; @@ -84,7 +85,7 @@ public static Response processTEI(final InputStream inputStream) { } // starts conversion process - retVal = classifier.processTEI(originFile.getAbsolutePath(), true, false); + retVal = classifier.processTEI(originFile.getAbsolutePath(), segmentSentencesBool, false); if (!isResultOK(retVal)) { response = Response.status(Response.Status.NO_CONTENT).build(); @@ -516,6 +517,14 @@ public static String methodLogOut() { return "<< " + DatastetProcessFile.class.getName() + "." 
+ Thread.currentThread().getStackTrace()[1].getMethodName(); } + private static boolean validateTrueFalseParam(String param) { + boolean booleanOutput = false; + if ((param != null) && (param.equals("1") || param.equalsIgnoreCase("true"))) { + booleanOutput = true; + } + return booleanOutput; + } + /** * Check whether the result is null or empty. */ From 981ac95dffb743c8831fa7cbc1d277cf5f930e31 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 26 Apr 2024 13:41:15 +0800 Subject: [PATCH 11/46] implement tei processing for datasets --- resources/config/config.yml | 3 +- .../org/grobid/core/data/DataseerResults.java | 37 + .../core/engines/DataseerClassifier.java | 83 +- .../engines/DatasetContextClassifier.java | 2 +- .../grobid/core/engines/DatasetParser.java | 1807 +++++++++-------- .../grobid/core/utilities/XMLUtilities.java | 56 +- .../controller/DatastetController.java | 18 +- .../service/controller/DatastetPaths.java | 2 + .../controller/DatastetProcessFile.java | 82 +- .../core/engines/DatasetParserTest.java | 18 + 10 files changed, 1151 insertions(+), 957 deletions(-) create mode 100644 src/main/java/org/grobid/core/data/DataseerResults.java create mode 100644 src/test/java/org/grobid/core/engines/DatasetParserTest.java diff --git a/resources/config/config.yml b/resources/config/config.yml index 7569db5..a0ab98f 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -4,6 +4,7 @@ corpusPath: "./resources/dataset/dataseer/corpus" templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template" grobidHome: "../../grobid/grobid-home" tmpPath: "tmp/" +#resourcesPath: "./resources" # path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI pub2teiPath: "../../Pub2TEI/" @@ -19,7 +20,7 @@ entityFishingPort: 443 #entityFishingPort: 8090 # if true we use binary classifiers for the contexts, otherwise use a single multi-label classifier -# binary classifiers perform better, but havier to use +# binary classifiers perform better, but heavier to use useBinaryContextClassifiers: false # sequence labeling model (identify data-related sections) diff --git a/src/main/java/org/grobid/core/data/DataseerResults.java b/src/main/java/org/grobid/core/data/DataseerResults.java new file mode 100644 index 0000000..90e9e12 --- /dev/null +++ b/src/main/java/org/grobid/core/data/DataseerResults.java @@ -0,0 +1,37 @@ +package org.grobid.core.data; + +public class DataseerResults { + Double bestScore; + Double hasDatasetScore; + String bestType; + + public DataseerResults(Double bestScore, Double hasDatasetScore, String bestType) { + this.bestScore = bestScore; + this.hasDatasetScore = hasDatasetScore; + this.bestType = bestType; + } + + public Double getBestScore() { + return bestScore; + } + + public void setBestScore(Double bestScore) { + this.bestScore = bestScore; + } + + public Double getHasDatasetScore() { + return hasDatasetScore; + } + + public void setHasDatasetScore(Double hasDatasetScore) { + this.hasDatasetScore = hasDatasetScore; + } + + public String getBestType() { + return bestType; + } + + public void setBestType(String bestType) { + this.bestType = bestType; + } +} diff --git a/src/main/java/org/grobid/core/engines/DataseerClassifier.java b/src/main/java/org/grobid/core/engines/DataseerClassifier.java index e30e84d..011b169 100644 --- a/src/main/java/org/grobid/core/engines/DataseerClassifier.java +++ b/src/main/java/org/grobid/core/engines/DataseerClassifier.java @@ -1,76 +1,45 @@ package org.grobid.core.engines; +import 
com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.FileUtils; -import org.grobid.core.GrobidModels; -import org.grobid.core.analyzers.DatastetAnalyzer; -import org.grobid.core.data.BiblioItem; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.BoundingBox; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.utilities.*; -import org.grobid.core.lexicon.DatastetLexicon; -import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.engines.tagging.GrobidCRFEngine; -import org.grobid.core.engines.tagging.*; -import org.grobid.core.jni.PythonEnvironmentConfig; import org.grobid.core.jni.DeLFTClassifierModel; +import org.grobid.core.utilities.*; import org.grobid.core.utilities.GrobidConfig.ModelParameters; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; - +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.ls.DOMImplementationLS; +import org.w3c.dom.ls.LSSerializer; import org.xml.sax.InputSource; -import org.w3c.dom.*; -import javax.xml.parsers.*; -import java.io.*; -import javax.xml.transform.*; -import javax.xml.transform.dom.*; -import javax.xml.transform.stream.*; -import org.w3c.dom.traversal.DocumentTraversal; -import org.w3c.dom.traversal.NodeFilter; -import org.w3c.dom.traversal.TreeWalker; - -import org.w3c.dom.ls.*; - -import java.io.*; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.File; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; import java.util.*; import static java.nio.charset.StandardCharsets.UTF_8; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.commons.lang3.SystemUtils; -import org.apache.commons.lang3.tuple.Pair; - -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; - -//import opennlp.tools.sentdetect.SentenceDetectorME; -//import opennlp.tools.sentdetect.SentenceModel; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import java.lang.reflect.Field; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; - -import static org.apache.commons.lang3.ArrayUtils.isEmpty; - /** * Dataset 
identification. * diff --git a/src/main/java/org/grobid/core/engines/DatasetContextClassifier.java b/src/main/java/org/grobid/core/engines/DatasetContextClassifier.java index 3610b72..a0ee9f8 100644 --- a/src/main/java/org/grobid/core/engines/DatasetContextClassifier.java +++ b/src/main/java/org/grobid/core/engines/DatasetContextClassifier.java @@ -169,7 +169,7 @@ public List> classifyDocumentContexts(List> entities for(List datasets : entities) { for(Dataset entity : datasets) { - if (entity.getContext() != null && entity.getContext().length()>0) { + if (StringUtils.isNotBlank(entity.getContext())) { String localContext = TextUtilities.dehyphenize(entity.getContext()); localContext = localContext.replace("\n", " "); localContext = localContext.replaceAll("( )+", " "); diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 7c7bb0b..e00f0cf 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1,29 +1,29 @@ package org.grobid.core.engines; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import nu.xom.Element; +import nu.xom.Node; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.DatastetAnalyzer; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.data.BibDataSet; -import org.grobid.core.data.Dataset; +import org.grobid.core.data.*; import org.grobid.core.data.Dataset.DatasetType; -import org.grobid.core.data.DatasetComponent; -import org.grobid.core.data.BiblioComponent; import org.grobid.core.document.Document; import org.grobid.core.document.DocumentPiece; import org.grobid.core.document.DocumentSource; import org.grobid.core.document.TEIFormatter; -import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.engines.label.DatasetTaggingLabels; import org.grobid.core.engines.label.SegmentationLabels; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.label.TaggingLabels; -import org.grobid.core.engines.label.DatasetTaggingLabels; import org.grobid.core.engines.tagging.GrobidCRFEngine; -import org.grobid.core.engines.DatasetParser; import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorDataseer; import org.grobid.core.features.FeatureFactory; import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; @@ -31,48 +31,29 @@ import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.layout.PDFAnnotation.Type; import org.grobid.core.lexicon.DatastetLexicon; +import org.grobid.core.lexicon.FastMatcher; import org.grobid.core.lexicon.Lexicon; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.*; -import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.utilities.counters.impl.CntManagerFactory; -import org.grobid.core.lexicon.FastMatcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.xml.sax.InputSource; -import 
javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; +import org.xml.sax.SAXException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; +import java.io.File; +import java.io.IOException; +import java.io.StringReader; import java.util.*; - -import nu.xom.Attribute; -import nu.xom.Element; -import nu.xom.Node; -import nu.xom.Text; - -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; -import org.apache.commons.lang3.tuple.Pair; - -import org.w3c.dom.*; -import javax.xml.parsers.*; -import javax.xml.transform.*; -import javax.xml.transform.dom.*; -import javax.xml.transform.stream.*; -import org.w3c.dom.traversal.DocumentTraversal; -import org.w3c.dom.traversal.NodeFilter; -import org.w3c.dom.traversal.TreeWalker; +import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; @@ -107,9 +88,9 @@ private static synchronized void getNewInstance(DatastetConfiguration configurat } private DatasetParser(DatastetConfiguration configuration) { - super(DatasetModels.DATASET, CntManagerFactory.getCntManager(), - GrobidCRFEngine.valueOf(configuration.getModel("datasets").engine.toUpperCase()), - configuration.getModel("datasets").delft.architecture); + super(DatasetModels.DATASET, CntManagerFactory.getCntManager(), + GrobidCRFEngine.valueOf(configuration.getModel("datasets").engine.toUpperCase()), + configuration.getModel("datasets").delft.architecture); datastetLexicon = DatastetLexicon.getInstance(); parsers = new EngineParsers(); @@ -120,10 +101,9 @@ private DatasetParser(DatastetConfiguration configuration) { /** * Sequence labelling of a list of layout tokens for identifying dataset names. * Input corresponds to a list of sentences, each sentence being itself a list of Layout tokens. - * + * * @param tokensList the list of LayoutTokens sequences to be labeled - * - * @return list of identified Dataset objects. + * @return list of identified Dataset objects. */ public List> processing(List> tokensList, boolean disambiguate) { return processing(tokensList, null, disambiguate); @@ -132,12 +112,11 @@ public List> processing(List> tokensList, boolea /** * Sequence labelling of a list of layout tokens for identifying dataset names. * Input corresponds to a list of sentences, each sentence being itself a list of Layout tokens. - * + * * @param tokensList the list of LayoutTokens sequences to be labeled - * @param pdfAnnotations the list of PDF annotation objects (URI, GOTO, GOTOR) to better control + * @param pdfAnnotations the list of PDF annotation objects (URI, GOTO, GOTOR) to better control * the recognition - * - * @return list of identified Dataset objects. + * @return list of identified Dataset objects. */ public List> processing(List> tokensList, List pdfAnnotations, boolean disambiguate) { @@ -158,7 +137,7 @@ public List> processing(List> tokensList, List

> processing(List> tokensList, List

maxTokens) maxTokens = nbTokens; - + //inputs.add(input.toString()); input.append("\n\n"); total++; @@ -210,11 +189,11 @@ public List> processing(List> tokensList, List

bufferLocalDatasetcomponents = resultExtractionLayoutTokens(resBlocks[i], tokens, text); List localDatasetcomponentOffsets = new ArrayList<>(); - for(DatasetComponent localDatasetcomponent : localDatasetcomponents) { + for (DatasetComponent localDatasetcomponent : localDatasetcomponents) { localDatasetcomponentOffsets.add(localDatasetcomponent.getOffsets()); } - for(DatasetComponent component : bufferLocalDatasetcomponents) { - if (overlapsPosition(localDatasetcomponentOffsets, component.getOffsets())) + for (DatasetComponent component : bufferLocalDatasetcomponents) { + if (overlapsPosition(localDatasetcomponentOffsets, component.getOffsets())) continue; localDatasetcomponents.add(component); } @@ -229,7 +208,7 @@ public List> processing(List> tokensList, List

indexToBeFiltered = new ArrayList<>(); int k = 0; - for(Dataset entity : localDatasets) { + for (Dataset entity : localDatasets) { if (entity.getDatasetName() != null) { String term = entity.getDatasetName().getNormalizedForm(); if (term == null || term.length() == 0) { @@ -243,7 +222,7 @@ public List> processing(List> tokensList, List

0) { - for(int j=indexToBeFiltered.size()-1; j>= 0; j--) { + for (int j = indexToBeFiltered.size() - 1; j >= 0; j--) { localDatasets.remove(indexToBeFiltered.get(j).intValue()); } } @@ -255,7 +234,7 @@ public List> processing(List> tokensList, List

(); k = 0; - for(Dataset entity : localDatasets) { + for (Dataset entity : localDatasets) { if (entity.isFiltered()) { indexToBeFiltered.add(Integer.valueOf(k)); } @@ -263,7 +242,7 @@ public List> processing(List> tokensList, List

0) { - for(int j=indexToBeFiltered.size()-1; j>= 0; j--) { + for (int j = indexToBeFiltered.size() - 1; j >= 0; j--) { localDatasets.remove(indexToBeFiltered.get(j).intValue()); } } @@ -279,7 +258,7 @@ public List> processing(List> tokensList, List

resultExtractionLayoutTokens(String result, List tokenizations, String text) { List datasetComponents = new ArrayList<>(); - + //String text = LayoutTokensUtil.toText(tokenizations); TaggingTokenClusteror clusteror = new TaggingTokenClusteror(DatasetModels.DATASET, result, tokenizations); @@ -294,19 +273,19 @@ private List resultExtractionLayoutTokens(String result, List< } TaggingLabel clusterLabel = cluster.getTaggingLabel(); Engine.getCntManager().i(clusterLabel); - + //String clusterText = LayoutTokensUtil.toText(cluster.concatTokens()); List theTokens = cluster.concatTokens(); // remove possible trailing superscript number tokens, this is very unfrequent // but it looks bad when it happens int indexLastToken = theTokens.size(); - for(int j=theTokens.size()-1; j>=0; j--) { + for (int j = theTokens.size() - 1; j >= 0; j--) { LayoutToken lastToken = theTokens.get(j); - if (lastToken.isSuperscript() && - lastToken.getText() != null && - lastToken.getText().length() > 0 && - lastToken.getText().matches("[0-9]+")) { + if (lastToken.isSuperscript() && + lastToken.getText() != null && + lastToken.getText().length() > 0 && + lastToken.getText().matches("[0-9]+")) { indexLastToken--; } else { break; @@ -316,12 +295,12 @@ private List resultExtractionLayoutTokens(String result, List< if (indexLastToken != theTokens.size()) { theTokens = theTokens.subList(0, indexLastToken); } - - if ((pos < text.length()-1) && (text.charAt(pos) == ' ')) + + if ((pos < text.length() - 1) && (text.charAt(pos) == ' ')) pos += 1; - if ((pos < text.length()-1) && (text.charAt(pos) == '\n')) + if ((pos < text.length() - 1) && (text.charAt(pos) == '\n')) pos += 1; - + int endPos = pos; boolean start = true; for (LayoutToken token : theTokens) { @@ -337,9 +316,9 @@ private List resultExtractionLayoutTokens(String result, List< } } - if ((endPos > 0) && (text.length() >= endPos) && (text.charAt(endPos-1) == '\n')) + if ((endPos > 0) && (text.length() >= endPos) && (text.charAt(endPos - 1) == '\n')) endPos--; - if ((endPos > 0) && (text.length() >= endPos) && (text.charAt(endPos-1) == ' ')) + if ((endPos > 0) && (text.length() >= endPos) && (text.charAt(endPos - 1) == ' ')) endPos--; if (endPos > text.length()) @@ -352,7 +331,7 @@ private List resultExtractionLayoutTokens(String result, List< dataset = new DatasetComponent(DatasetType.DATASET, text.substring(pos, endPos)); } else if (clusterLabel.equals(DatasetTaggingLabels.DATA_DEVICE)) { dataset = new DatasetComponent(DatasetType.DATA_DEVICE, text.substring(pos, endPos)); - } + } if (dataset != null) { dataset.setOffsetStart(pos); @@ -365,10 +344,10 @@ private List resultExtractionLayoutTokens(String result, List< dataset.setBoundingBoxes(boundingBoxes); // if we just have junk/number, this is not a valid dataset name - if (dataset.getNormalizedForm() != null && - dataset.getNormalizedForm().length() > 0 && - !dataset.getNormalizedForm().matches("[0-9\\(\\)/\\[\\]\\,\\.\\:\\-\\+\\; ]+") && - !(DatastetLexicon.getInstance().isEnglishStopword(dataset.getNormalizedForm()))) { + if (dataset.getNormalizedForm() != null && + dataset.getNormalizedForm().length() > 0 && + !dataset.getNormalizedForm().matches("[0-9\\(\\)/\\[\\]\\,\\.\\:\\-\\+\\; ]+") && + !(DatastetLexicon.getInstance().isEnglishStopword(dataset.getNormalizedForm()))) { datasetComponents.add(dataset); } dataset = null; @@ -382,8 +361,8 @@ private List resultExtractionLayoutTokens(String result, List< private List groupByEntities(List components, List tokens, String text) { List localDatasets = new ArrayList<>(); 
- Dataset localDataset = null; - for(DatasetComponent localComponent : components) { + Dataset localDataset = null; + for (DatasetComponent localComponent : components) { if (localComponent.getType() == DatasetType.DATASET_NAME) { if (localDataset != null) { localDataset.setContext(text); @@ -402,13 +381,13 @@ private List groupByEntities(List components, List addUrlComponents(List sentenceTokens, - List existingComponents, - String text, + private List addUrlComponents(List sentenceTokens, + List existingComponents, + String text, List pdfAnnotations) { // positions for lexical match List urlPositions = DatasetParser.characterPositionsUrlPattern(sentenceTokens, pdfAnnotations, text); List existingPositions = new ArrayList<>(); - for(DatasetComponent existingComponent : existingComponents) { + for (DatasetComponent existingComponent : existingComponents) { existingPositions.add(existingComponent.getOffsets()); } // note: url positions are token index, not character offsets - for(OffsetPosition urlPosition : urlPositions) { + for (OffsetPosition urlPosition : urlPositions) { if (overlapsPosition(existingPositions, urlPosition)) { continue; } @@ -446,8 +425,8 @@ private List addUrlComponents(List sentenceTokens List urlTokens = new ArrayList<>(); int tokenPos = 0; int tokenIndex = 0; - for(LayoutToken localToken : sentenceTokens) { - if (startPos <= tokenPos && (tokenPos+localToken.getText().length() <= endPos) ) { + for (LayoutToken localToken : sentenceTokens) { + if (startPos <= tokenPos && (tokenPos + localToken.getText().length() <= endPos)) { urlTokens.add(localToken); if (startTokenIndex == -1) startTokenIndex = tokenIndex; @@ -463,10 +442,10 @@ private List addUrlComponents(List sentenceTokens // to refine the url position/recognition, check overlapping PDF annotation PDFAnnotation targetAnnotation = null; - if (pdfAnnotations != null && urlTokens.size()>0) { - LayoutToken lastToken = urlTokens.get(urlTokens.size()-1); + if (pdfAnnotations != null && urlTokens.size() > 0) { + LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); for (PDFAnnotation pdfAnnotation : pdfAnnotations) { - if (pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI) { + if (pdfAnnotation.getType() != null && pdfAnnotation.getType() == Type.URI) { if (pdfAnnotation.cover(lastToken)) { //System.out.println("found overlapping PDF annotation for URL: " + pdfAnnotation.getDestination() + " in sentence: " + text); targetAnnotation = pdfAnnotation; @@ -508,50 +487,119 @@ private List addUrlComponents(List sentenceTokens } /** - * Sequence labelling of a string for identifying dataset names. - * - * @param tokens the list of LayoutTokens to be labeled - * - * @return list of identified Dataset objects. + * Sequence labelling of a string for identifying dataset names. 
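+     * A short Javadoc sketch (my addition, not in the original patch): parameter names follow the
+     * signature below; the description of "disambiguate" assumes the entity-fishing disambiguation
+     * configured elsewhere in this service.
+     * @param input        the text to be processed
+     * @param disambiguate whether to apply entity disambiguation to the identified mentions
+     * @return the list of identified Dataset objects for the input string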
*/ public List processingString(String input, boolean disambiguate) { List> tokensList = new ArrayList<>(); input = UnicodeUtil.normaliseText(input); tokensList.add(analyzer.tokenizeWithLayoutToken(input)); List> result = processing(tokensList, disambiguate); - if (result != null && result.size()>0) + if (result != null && result.size() > 0) return result.get(0); - else + else return new ArrayList(); } + private List classifyWithDataseerClassifier(List allSentences) { + // pre-process classification of every sentences in batch + if (this.dataseerClassifier == null) + dataseerClassifier = DataseerClassifier.getInstance(); + + int totalClassificationNodes = 0; + + List results = new ArrayList<>(); + + try { + String jsonClassification = dataseerClassifier.classify(allSentences); + //System.out.println(jsonClassification); + + //List hasDatasets = new ArrayList<>(); + ObjectMapper mapper = new ObjectMapper(); + try { + //System.out.println(jsonClassification); + JsonNode root = mapper.readTree(jsonClassification); + JsonNode classificationsNode = root.findPath("classifications"); + if ((classificationsNode != null) && (!classificationsNode.isMissingNode())) { + Iterator ite = classificationsNode.elements(); + + while (ite.hasNext()) { + JsonNode classificationNode = ite.next(); + Iterator iterator = classificationNode.fieldNames(); + Map scoresPerDatatypes = new TreeMap<>(); + double hasDatasetScore = 0.0; + while (iterator.hasNext()) { + String field = iterator.next(); + if (field.equals("has_dataset")) { + JsonNode hasDatasetNode = classificationNode.findPath("has_dataset"); + if ((hasDatasetNode != null) && (!hasDatasetNode.isMissingNode())) { + hasDatasetScore = hasDatasetNode.doubleValue(); + } + } else if (field.equals("text")) { + String localSentence = classificationNode.get("text").textValue(); + // the following should never happen + if (!localSentence.equals(allSentences.get(totalClassificationNodes))) { + System.out.println("sentence, got: " + localSentence); + System.out.println("\texpecting: " + allSentences.get(totalClassificationNodes)); + } + } else if (!field.equals("no_dataset")) { + scoresPerDatatypes.put(field, classificationNode.get(field).doubleValue()); + } + } + + double bestScore = 0.0; + String bestType = null; + for (Map.Entry entry : scoresPerDatatypes.entrySet()) { + if (entry.getValue() > bestScore) { + bestScore = entry.getValue(); + bestType = entry.getKey(); + } + } + + results.add(new DataseerResults(bestScore, hasDatasetScore, bestType)); + + totalClassificationNodes++; + } + } + } catch (JsonProcessingException e) { + LOGGER.error("Parsing of dataseer classifier JSON result failed", e); + } catch (Exception e) { + LOGGER.error("Error when applying dataseer sentence classifier", e); + } + + } catch (Exception e) { + e.printStackTrace(); + } + + return results; + } + public List> processingStrings(List inputs, boolean disambiguate) { List> tokensList = new ArrayList<>(); - for(String input : inputs) { + for (String input : inputs) { input = UnicodeUtil.normaliseText(input); tokensList.add(analyzer.tokenizeWithLayoutToken(input)); } return processing(tokensList, disambiguate); } - public Pair>,Document> processPDF(File file, - boolean disambiguate) throws IOException { + public Pair>, Document> processPDF(File file, + boolean disambiguate) throws IOException { List> entities = new ArrayList<>(); Document doc = null; try { GrobidAnalysisConfig config = - GrobidAnalysisConfig.builder() - .consolidateHeader(0) - .consolidateCitations(0) - .build(); + 
GrobidAnalysisConfig.builder() + .consolidateHeader(0) + .consolidateCitations(0) + .build(); - DocumentSource documentSource = - DocumentSource.fromPdf(file, config.getStartPage(), config.getEndPage()); + DocumentSource documentSource = + DocumentSource.fromPdf(file, config.getStartPage(), config.getEndPage()); doc = parsers.getSegmentationParser().processing(documentSource, config); // process bibliographical reference section first List resCitations = parsers.getCitationParser(). - processingReferenceSection(doc, parsers.getReferenceSegmenterParser(), config.getConsolidateCitations()); + processingReferenceSection(doc, parsers.getReferenceSegmenterParser(), config.getConsolidateCitations()); doc.setBibDataSets(resCitations); @@ -575,7 +623,7 @@ public Pair>,Document> processPDF(File file, SortedSet documentParts = doc.getDocumentPart(SegmentationLabels.HEADER); BiblioItem resHeader = null; if (documentParts != null) { - Pair> headerFeatured = parsers.getHeaderParser().getSectionHeaderFeatured(doc, documentParts); + Pair> headerFeatured = parsers.getHeaderParser().getSectionHeaderFeatured(doc, documentParts); String header = headerFeatured.getLeft(); List tokenizationHeader = headerFeatured.getRight(); String labeledResult = null; @@ -584,7 +632,7 @@ public Pair>,Document> processPDF(File file, resHeader = new BiblioItem(); try { resHeader.generalResultMappingHeader(labeledResult, tokenizationHeader); - } catch(Exception e) { + } catch (Exception e) { LOGGER.error("Problem decoding header labeling, header will be skipped", e); resHeader = null; } @@ -596,7 +644,7 @@ public Pair>,Document> processPDF(File file, selectedLayoutTokenSequences.add(titleTokens); relevantSectionsNamedDatasets.add(false); relevantSectionsImplicitDatasets.add(false); - } + } // abstract List abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT); @@ -604,7 +652,7 @@ public Pair>,Document> processPDF(File file, selectedLayoutTokenSequences.add(abstractTokens); relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(false); - } + } // keywords List keywordTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_KEYWORD); @@ -630,14 +678,14 @@ public Pair>,Document> processPDF(File file, LayoutTokenization tokenizationBody = featSeg.getRight(); String rese = null; - if ( (bodytext != null) && (bodytext.trim().length() > 0) ) { + if ((bodytext != null) && (bodytext.trim().length() > 0)) { rese = parsers.getFullTextParser().label(bodytext); } else { LOGGER.debug("Fulltext model: The input to the sequence labelling processing is empty"); } - TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, - tokenizationBody.getTokenization(), true); + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, + tokenizationBody.getTokenization(), true); bodyClusters = clusteror.cluster(); List curParagraphTokens = null; TaggingLabel lastClusterLabel = null; @@ -652,14 +700,14 @@ public Pair>,Document> processPDF(File file, List localTokenization = cluster.concatTokens(); if ((localTokenization == null) || (localTokenization.size() == 0)) continue; - + if (TEIFormatter.MARKER_LABELS.contains(clusterLabel)) { if (curParagraphTokens == null) curParagraphTokens = new ArrayList<>(); //curParagraphTokens.addAll(localTokenization); } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH) || clusterLabel.equals(TaggingLabels.ITEM)) { //|| clusterLabel.equals(TaggingLabels.SECTION) { - if (lastClusterLabel == null || curParagraphTokens 
== null || isNewParagraph(lastClusterLabel)) { + if (lastClusterLabel == null || curParagraphTokens == null || isNewParagraph(lastClusterLabel)) { if (curParagraphTokens != null) { selectedLayoutTokenSequences.add(curParagraphTokens); relevantSectionsNamedDatasets.add(true); @@ -705,14 +753,14 @@ public Pair>,Document> processPDF(File file, LayoutTokenization tokenizationBody = featSeg.getRight(); String rese = null; - if ( (bodytext != null) && (bodytext.trim().length() > 0) ) { + if ((bodytext != null) && (bodytext.trim().length() > 0)) { rese = parsers.getFullTextParser().label(bodytext); } else { LOGGER.debug("Fulltext model applied to Annex: The input to the sequence labelling processing is empty"); } - TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, - tokenizationBody.getTokenization(), true); + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, + tokenizationBody.getTokenization(), true); List bodyAnnexClusters = clusteror.cluster(); List curParagraphTokens = null; TaggingLabel lastClusterLabel = null; @@ -729,7 +777,7 @@ public Pair>,Document> processPDF(File file, List localTokenization = cluster.concatTokens(); if ((localTokenization == null) || (localTokenization.size() == 0)) continue; - + if (TEIFormatter.MARKER_LABELS.contains(clusterLabel)) { if (previousSection == null || previousSection.equals("das")) { if (curParagraphTokens == null) @@ -737,7 +785,7 @@ public Pair>,Document> processPDF(File file, curParagraphTokens.addAll(localTokenization); } } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH) || clusterLabel.equals(TaggingLabels.ITEM)) { - if (lastClusterLabel == null || curParagraphTokens == null || isNewParagraph(lastClusterLabel)) { + if (lastClusterLabel == null || curParagraphTokens == null || isNewParagraph(lastClusterLabel)) { if (curParagraphTokens != null && previousSection == null) { selectedLayoutTokenSequences.add(curParagraphTokens); relevantSectionsNamedDatasets.add(true); @@ -746,7 +794,7 @@ public Pair>,Document> processPDF(File file, selectedLayoutTokenSequences.add(curParagraphTokens); relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); - } + } curParagraphTokens = new ArrayList<>(); } if (curParagraphTokens == null) @@ -777,7 +825,7 @@ public Pair>,Document> processPDF(File file, selectedLayoutTokenSequences.add(curParagraphTokens); relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); - } + } } } @@ -797,7 +845,141 @@ public Pair>,Document> processPDF(File file, documentParts = doc.getDocumentPart(SegmentationLabels.AVAILABILITY); if (documentParts != null) { availabilityTokens = doc.getTokenizationParts(documentParts, doc.getTokenizations()); - if (availabilityTokens != null) { + if (availabilityTokens != null) { // we attach and match bibliographical reference callout + TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); + // second pass, body + if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { + List bibRefComponents = new ArrayList(); + for (TaggingTokenCluster cluster : bodyClusters) { + if (cluster == null) { + continue; + } + + TaggingLabel clusterLabel = cluster.getTaggingLabel(); + + List localTokenization = cluster.concatTokens(); + if ((localTokenization == null) || (localTokenization.size() == 0)) + continue; + + if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { + List refTokens = TextUtilities.dehyphenize(localTokenization); 
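+                            // note: markReferencesTEILuceneBased resolves this citation callout (e.g. "[12]" or
+                            // "Smith et al. 2020") against the bibliography parsed earlier; each returned ref node
+                            // carries a "#bXX" target and PDF coordinates, which are mapped back to the
+                            // corresponding BibDataSet entry below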
+ String chunkRefString = LayoutTokensUtil.toText(refTokens); + + List refNodes = formatter.markReferencesTEILuceneBased(refTokens, + doc.getReferenceMarkerMatcher(), + true, // generate coordinates + false); // do not mark unsolved callout as ref + + if (refNodes != null) { + for (Node refNode : refNodes) { + if (refNode instanceof Element) { + // get the bib ref key + String refKey = ((Element) refNode).getAttributeValue("target"); + + if (refKey == null) + continue; + + int refKeyVal = -1; + if (refKey.startsWith("#b")) { + refKey = refKey.substring(2, refKey.length()); + try { + refKeyVal = Integer.parseInt(refKey); + } catch (Exception e) { + LOGGER.warn("Invalid ref identifier: " + refKey); + } + } + if (refKeyVal == -1) + continue; + + // get the bibref object + BibDataSet resBib = resCitations.get(refKeyVal); + if (resBib != null) { + BiblioComponent biblioComponent = new BiblioComponent(resBib.getResBib(), refKeyVal); + biblioComponent.setRawForm(refNode.getValue()); + biblioComponent.setOffsetStart(refTokens.get(0).getOffset()); + biblioComponent.setOffsetEnd(refTokens.get(refTokens.size() - 1).getOffset() + + refTokens.get(refTokens.size() - 1).getText().length()); + List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); + biblioComponent.setBoundingBoxes(boundingBoxes); + bibRefComponents.add(biblioComponent); + } + } + } + } + } + } + + + if (bibRefComponents.size() > 0) { + // attach references to dataset entities + entities = attachRefBib(entities, bibRefComponents); + } + + // consolidate the attached ref bib (we don't consolidate all bibliographical references + // to avoid useless costly computation) + List citationsToConsolidate = new ArrayList(); + List consolidated = new ArrayList(); + for (List datasets : entities) { + for (Dataset entity : datasets) { + if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { + List bibRefs = entity.getBibRefs(); + for (BiblioComponent bibRef : bibRefs) { + Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); + if (!consolidated.contains(refKeyVal)) { + citationsToConsolidate.add(resCitations.get(refKeyVal)); + consolidated.add(refKeyVal); + } + } + } + } + } + + try { + Consolidation consolidator = Consolidation.getInstance(); + Map resConsolidation = consolidator.consolidate(citationsToConsolidate); + for (int j = 0; j < citationsToConsolidate.size(); j++) { + BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); + BiblioItem bibo = resConsolidation.get(j); + if (bibo != null) { + BiblioItem.correct(resCitation, bibo); + } + } + } catch (Exception e) { + throw new GrobidException( + "An exception occured while running consolidation on bibliographical references.", e); + } + + // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
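+            // i.e. when one mention of a dataset name has been linked to a bibliographical reference,
+            // the same reference is copied to the other mentions of that name that have none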
+ for (List datasets1 : entities) { + for (Dataset entity1 : datasets1) { + if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { + for (List datasets2 : entities) { + for (Dataset entity2 : datasets2) { + if (entity2.getBibRefs() != null) { + continue; + } + if ((entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && + entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && + (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || + entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) + ) { + List newBibRefs = new ArrayList<>(); + for (BiblioComponent bibComponent : entity1.getBibRefs()) { + newBibRefs.add(new BiblioComponent(bibComponent)); + } + entity2.setBibRefs(newBibRefs); + } + } + } + } + } + } + } + + // mark datasets present in Data Availability section(s) + if (CollectionUtils.isNotEmpty(availabilityTokens)) { + entities = markDAS(entities, availabilityTokens); + } selectedLayoutTokenSequences.add(availabilityTokens); relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); @@ -810,15 +992,15 @@ public Pair>,Document> processPDF(File file, List sentenceOffsetStarts = new ArrayList<>(); int zoneIndex = 0; int accumulatedOffset = 0; - Map mapSentencesToZones = new HashMap<>(); - for(List layoutTokens : selectedLayoutTokenSequences) { + Map mapSentencesToZones = new HashMap<>(); + for (List layoutTokens : selectedLayoutTokenSequences) { layoutTokens = DatastetAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens); - if ( (layoutTokens == null) || (layoutTokens.size() == 0) ) { + if ((layoutTokens == null) || (layoutTokens.size() == 0)) { //allLayoutTokens.add(null); //allSentences.add(null); List dummyLayoutTokens = new ArrayList<>(); - dummyLayoutTokens. 
add(new LayoutToken("dummy")); + dummyLayoutTokens.add(new LayoutToken("dummy")); allLayoutTokens.add(dummyLayoutTokens); //System.out.println("dummy sentence at " + (allSentences.size())); allSentences.add("dummy"); @@ -827,27 +1009,27 @@ public Pair>,Document> processPDF(File file, } accumulatedOffset = layoutTokens.get(0).getOffset(); - + // segment into sentences String localText = LayoutTokensUtil.toText(layoutTokens); List urlPositions = DatasetParser.characterPositionsUrlPattern(layoutTokens, pdfAnnotations, localText); - List sentencePositions = - SentenceUtilities.getInstance().runSentenceDetection(localText, urlPositions, layoutTokens, null); + List sentencePositions = + SentenceUtilities.getInstance().runSentenceDetection(localText, urlPositions, layoutTokens, null); if (sentencePositions == null) { sentencePositions = new ArrayList<>(); sentencePositions.add(new OffsetPosition(0, localText.length())); } - for(OffsetPosition sentencePosition : sentencePositions) { + for (OffsetPosition sentencePosition : sentencePositions) { int startPos = sentencePosition.start; int endPos = sentencePosition.end; List sentenceTokens = new ArrayList<>(); int pos = 0; - for(LayoutToken token : layoutTokens) { - if (startPos <= pos && (pos+token.getText().length()) <= endPos) { + for (LayoutToken token : layoutTokens) { + if (startPos <= pos && (pos + token.getText().length()) <= endPos) { sentenceTokens.add(token); - } else if (endPos < (pos+token.getText().length())) { + } else if (endPos < (pos + token.getText().length())) { break; } pos += token.getText().length(); @@ -855,8 +1037,8 @@ public Pair>,Document> processPDF(File file, allLayoutTokens.add(sentenceTokens); allSentences.add(localText.substring(startPos, endPos)); - mapSentencesToZones.put(allSentences.size()-1, zoneIndex); - sentenceOffsetStarts.add(accumulatedOffset+startPos); + mapSentencesToZones.put(allSentences.size() - 1, zoneIndex); + sentenceOffsetStarts.add(accumulatedOffset + startPos); } zoneIndex++; } @@ -872,77 +1054,7 @@ public Pair>,Document> processPDF(File file, //System.out.println("mapSentencesToZones size: " + mapSentencesToZones.size()); //System.out.println("relevantSections size: " + relevantSectionsNamedDatasets.size()); - // pre-process classification of every sentences in batch - if (this.dataseerClassifier == null) - dataseerClassifier = DataseerClassifier.getInstance(); - - int totalClassificationNodes = 0; - - List bestScores = new ArrayList<>(); - List bestTypes = new ArrayList<>(); - List hasDatasetScores = new ArrayList<>(); - try { - String jsonClassification = dataseerClassifier.classify(allSentences); - //System.out.println(jsonClassification); - - //List hasDatasets = new ArrayList<>(); - ObjectMapper mapper = new ObjectMapper(); - try { - //System.out.println(jsonClassification); - JsonNode root = mapper.readTree(jsonClassification); - JsonNode classificationsNode = root.findPath("classifications"); - if ((classificationsNode != null) && (!classificationsNode.isMissingNode())) { - Iterator ite = classificationsNode.elements(); - - while(ite.hasNext()) { - JsonNode classificationNode = ite.next(); - Iterator iterator = classificationNode.fieldNames(); - Map scoresPerDatatypes = new TreeMap<>(); - double hasDatasetScore = 0.0; - while(iterator.hasNext()) { - String field = iterator.next(); - if (field.equals("has_dataset")) { - JsonNode hasDatasetNode = classificationNode.findPath("has_dataset"); - if ((hasDatasetNode != null) && (!hasDatasetNode.isMissingNode())) { - hasDatasetScore = 
hasDatasetNode.doubleValue(); - } - } else if (field.equals("text")) { - String localSentence = classificationNode.get("text").textValue(); - // the following should never happen - if (!localSentence.equals(allSentences.get(totalClassificationNodes))) { - System.out.println("sentence, got: " + localSentence); - System.out.println("\texpecting: " + allSentences.get(totalClassificationNodes)); - } - } else if (!field.equals("no_dataset")) { - scoresPerDatatypes.put(field, classificationNode.get(field).doubleValue()); - } - } - - double bestScore = 0.0; - String bestType = null; - for (Map.Entry entry : scoresPerDatatypes.entrySet()) { - if (entry.getValue() > bestScore) { - bestScore = entry.getValue(); - bestType = entry.getKey(); - } - } - - bestTypes.add(bestType); - bestScores.add(bestScore); - hasDatasetScores.add(hasDatasetScore); - - totalClassificationNodes++; - } - } - } catch(JsonProcessingException e) { - LOGGER.error("Parsing of dataseer classifier JSON result failed", e); - } catch(Exception e) { - LOGGER.error("Error when applying dataseer sentence classifier", e); - } - - } catch(Exception e) { - e.printStackTrace(); - } + List results = classifyWithDataseerClassifier(allSentences); //System.out.println("total data sentence classifications: " + totalClassificationNodes); //System.out.println("bestTypes size: " + bestTypes.size()); @@ -950,20 +1062,21 @@ public Pair>,Document> processPDF(File file, //System.out.println("hasDatasetScores size: " + hasDatasetScores.size()); int i = 0; - for(List localDatasets : entities) { - if (localDatasets == null || localDatasets.size() == 0) { + for (List localDatasets : entities) { + if (CollectionUtils.isNotEmpty(localDatasets)) { i++; continue; } - for(Dataset localDataset : localDatasets) { + for (Dataset localDataset : localDatasets) { if (localDataset == null) { continue; } + DataseerResults result = results.get(i); - if (localDataset.getType() == DatasetType.DATASET && (bestTypes.get(i) != null) && localDataset.getDataset() != null) { - localDataset.getDataset().setBestDataType(bestTypes.get(i)); - localDataset.getDataset().setBestDataTypeScore(bestScores.get(i)); - localDataset.getDataset().setHasDatasetScore(hasDatasetScores.get(i)); + if (localDataset.getType() == DatasetType.DATASET && (result.getBestType() != null) && localDataset.getDataset() != null) { + localDataset.getDataset().setBestDataType(result.getBestType()); + localDataset.getDataset().setBestDataTypeScore(result.getBestScore()); + localDataset.getDataset().setHasDatasetScore(result.getHasDatasetScore()); } } i++; @@ -994,16 +1107,16 @@ public Pair>,Document> processPDF(File file, int index = 0; List> newEntities = new ArrayList<>(); for (List sentenceTokens : allLayoutTokens) { - List localEntities = propagateLayoutTokenSequence(sentenceTokens, - entities.get(index), - termProfiles, - termPattern, - placeTaken.get(index), - frequencies, - sentenceOffsetStarts.get(index)); + List localEntities = propagateLayoutTokenSequence(sentenceTokens, + entities.get(index), + termProfiles, + termPattern, + placeTaken.get(index), + frequencies, + sentenceOffsetStarts.get(index)); if (localEntities != null) { Collections.sort(localEntities); - + // revisit and attach URL component localEntities = attachUrlComponents(localEntities, sentenceTokens, allSentences.get(index), pdfAnnotations); } @@ -1019,7 +1132,7 @@ public Pair>,Document> processPDF(File file, // filter implicit datasets based on selected relevant data section List> filteredEntities = new ArrayList<>(); index = 0; - 
for(List localDatasets : entities) { + for (List localDatasets : entities) { List filteredLocalEntities = new ArrayList<>(); if (mapSentencesToZones.get(index) == null) { @@ -1031,7 +1144,7 @@ public Pair>,Document> processPDF(File file, if (currentZoneObject == null) { index++; continue; - } + } int currentZone = currentZoneObject.intValue(); @@ -1042,27 +1155,27 @@ public Pair>,Document> processPDF(File file, for (Dataset localDataset : localDatasets) { boolean referenceDataSource = false; - if (localDataset.getUrl() != null && - DatastetLexicon.getInstance().isDatasetURLorDOI(localDataset.getUrl().getNormalizedForm())) { + if (localDataset.getUrl() != null && + DatastetLexicon.getInstance().isDatasetURLorDOI(localDataset.getUrl().getNormalizedForm())) { referenceDataSource = true; } if (localDataset.getType() == DatasetType.DATASET && - !relevantSectionsImplicitDatasets.get(currentZone) && !referenceDataSource) { + !relevantSectionsImplicitDatasets.get(currentZone) && !referenceDataSource) { continue; - } + } if (localDataset.getType() == DatasetType.DATASET_NAME && - !relevantSectionsNamedDatasets.get(currentZone)) { + !relevantSectionsNamedDatasets.get(currentZone)) { continue; - } + } if (localDataset.getType() == DatasetType.DATASET && - localDataset.getDataset() != null && - localDataset.getDataset().getHasDatasetScore() < 0.5 && !referenceDataSource) { + localDataset.getDataset() != null && + localDataset.getDataset().getHasDatasetScore() < 0.5 && !referenceDataSource) { continue; - } - + } + filteredLocalEntities.add(localDataset); } @@ -1074,7 +1187,7 @@ public Pair>,Document> processPDF(File file, // we attach and match bibliographical reference callout TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); // second pass, body - if ( (bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0) ) { + if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { List bibRefComponents = new ArrayList(); for (TaggingTokenCluster cluster : bodyClusters) { if (cluster == null) { @@ -1092,16 +1205,16 @@ public Pair>,Document> processPDF(File file, String chunkRefString = LayoutTokensUtil.toText(refTokens); List refNodes = formatter.markReferencesTEILuceneBased(refTokens, - doc.getReferenceMarkerMatcher(), - true, // generate coordinates - false); // do not mark unsolved callout as ref + doc.getReferenceMarkerMatcher(), + true, // generate coordinates + false); // do not mark unsolved callout as ref - if (refNodes != null) { + if (refNodes != null) { for (nu.xom.Node refNode : refNodes) { if (refNode instanceof Element) { // get the bib ref key - String refKey = ((Element)refNode).getAttributeValue("target"); - + String refKey = ((Element) refNode).getAttributeValue("target"); + if (refKey == null) continue; @@ -1110,7 +1223,7 @@ public Pair>,Document> processPDF(File file, refKey = refKey.substring(2, refKey.length()); try { refKeyVal = Integer.parseInt(refKey); - } catch(Exception e) { + } catch (Exception e) { LOGGER.warn("Invalid ref identifier: " + refKey); } } @@ -1123,8 +1236,8 @@ public Pair>,Document> processPDF(File file, BiblioComponent biblioComponent = new BiblioComponent(resBib.getResBib(), refKeyVal); biblioComponent.setRawForm(refNode.getValue()); biblioComponent.setOffsetStart(refTokens.get(0).getOffset()); - biblioComponent.setOffsetEnd(refTokens.get(refTokens.size()-1).getOffset() + - refTokens.get(refTokens.size()-1).getText().length()); + biblioComponent.setOffsetEnd(refTokens.get(refTokens.size() - 
1).getOffset() + + refTokens.get(refTokens.size() - 1).getText().length()); List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); biblioComponent.setBoundingBoxes(boundingBoxes); bibRefComponents.add(biblioComponent); @@ -1145,11 +1258,11 @@ public Pair>,Document> processPDF(File file, // to avoid useless costly computation) List citationsToConsolidate = new ArrayList(); List consolidated = new ArrayList(); - for(List datasets : entities) { - for(Dataset entity : datasets) { + for (List datasets : entities) { + for (Dataset entity : datasets) { if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { List bibRefs = entity.getBibRefs(); - for(BiblioComponent bibRef: bibRefs) { + for (BiblioComponent bibRef : bibRefs) { Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); if (!consolidated.contains(refKeyVal)) { citationsToConsolidate.add(resCitations.get(refKeyVal)); @@ -1162,35 +1275,35 @@ public Pair>,Document> processPDF(File file, try { Consolidation consolidator = Consolidation.getInstance(); - Map resConsolidation = consolidator.consolidate(citationsToConsolidate); - for(int j=0; j resConsolidation = consolidator.consolidate(citationsToConsolidate); + for (int j = 0; j < citationsToConsolidate.size(); j++) { BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); BiblioItem bibo = resConsolidation.get(j); if (bibo != null) { BiblioItem.correct(resCitation, bibo); } } - } catch(Exception e) { + } catch (Exception e) { throw new GrobidException( - "An exception occured while running consolidation on bibliographical references.", e); + "An exception occured while running consolidation on bibliographical references.", e); } // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
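Editor's note: the stand-alone sketch below (not part of the patch; the class name and sample strings are invented) isolates the matching rule used by the propagation loop that follows, where a mention without references inherits the bibliographical references of another mention whose dataset name matches on either the normalized form or the raw form.

    import java.util.Objects;

    class NameMatchSketch {
        // Mirrors the condition used in the propagation loop: both raw forms must be present,
        // and either the normalized forms or the raw forms must be equal.
        static boolean sameDataset(String raw1, String norm1, String raw2, String norm2) {
            return raw1 != null && raw2 != null
                    && (Objects.equals(norm1, norm2) || Objects.equals(raw1, raw2));
        }

        public static void main(String[] args) {
            System.out.println(sameDataset("GenBank", "genbank", "GENBANK", "genbank")); // true
            System.out.println(sameDataset("GenBank", "genbank", "PDB", "pdb"));         // false
        }
    }
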
- for(List datasets1 : entities) { - for(Dataset entity1 : datasets1) { + for (List datasets1 : entities) { + for (Dataset entity1 : datasets1) { if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { - for(List datasets2 : entities) { - for(Dataset entity2 : datasets2) { + for (List datasets2 : entities) { + for (Dataset entity2 : datasets2) { if (entity2.getBibRefs() != null) { continue; } - if ( (entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && - entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && - (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || - entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) - ) { + if ((entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && + entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && + (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || + entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) + ) { List newBibRefs = new ArrayList<>(); - for(BiblioComponent bibComponent : entity1.getBibRefs()) { + for (BiblioComponent bibComponent : entity1.getBibRefs()) { newBibRefs.add(new BiblioComponent(bibComponent)); } entity2.setBibRefs(newBibRefs); @@ -1203,11 +1316,11 @@ public Pair>,Document> processPDF(File file, } // mark datasets present in Data Availability section(s) - if (availabilityTokens != null && availabilityTokens.size()>0) + if (availabilityTokens != null && availabilityTokens.size() > 0) entities = markDAS(entities, availabilityTokens); // finally classify the context for predicting the role of the dataset mention - entities = DatasetContextClassifier.getInstance(datastetConfiguration).classifyDocumentContexts(entities); + entities = DatasetContextClassifier.getInstance(datastetConfiguration).classifyDocumentContexts(entities); } catch (Exception e) { //e.printStackTrace(); @@ -1218,11 +1331,11 @@ public Pair>,Document> processPDF(File file, } public List> markDAS(List> entities, List availabilityTokens) { - for(List datasets1 : entities) { - for(Dataset entity1 : datasets1) { + for (List datasets1 : entities) { + for (Dataset entity1 : datasets1) { if (entity1.isInDataAvailabilitySection()) continue; - if (entity1.getContext() == null) + if (entity1.getContext() == null) continue; int context_offset_start = entity1.getGlobalContextOffset(); int context_offset_end = context_offset_start + entity1.getContext().length(); @@ -1237,7 +1350,7 @@ public List> markDAS(List> entities, List>, List> processXML(File file, boolean disambiguate, boolean addParagraphContext) throws IOException { + public Pair>, List> processXML(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException { Pair>, List> resultExtraction = null; try { String tei = processXML(file); @@ -1248,16 +1361,16 @@ public Pair>, List> processXML(File file, boolean org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei))); //document.getDocumentElement().normalize(); - - resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext); + + resultExtraction = processTEIDocument(document, segmentSentences, disambiguate, addParagraphContext); } catch (final Exception exp) { LOGGER.error("An error occured while processing the following XML file: " - + file.getPath(), exp); - } + + 
file.getPath(), exp); + } return resultExtraction; } - public Pair>, List> processTEI(File file, boolean disambiguate, boolean addParagraphContext) throws IOException { + public Pair>, List> processTEI(File file, boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) throws IOException { Pair>, List> resultExtraction = null; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -1265,22 +1378,22 @@ public Pair>, List> processTEI(File file, boolean DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(file); //document.getDocumentElement().normalize(); - resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext); + resultExtraction = processTEIDocument(document, segmentSentences, disambiguate, addParagraphContext); //tei = restoreDomParserAttributeBug(tei); } catch (final Exception exp) { LOGGER.error("An error occured while processing the following XML file: " - + file.getPath(), exp); - } + + file.getPath(), exp); + } return resultExtraction; } /** * Tranform an XML document (for example JATS) to a TEI document. - * Transformation of the XML/JATS/NLM/etc. document is realised thanks to Pub2TEI - * (https://github.com/kermitt2/pub2tei) - * + * Transformation of the XML/JATS/NLM/etc. document is realised thanks to Pub2TEI + * (https://github.com/kermitt2/pub2tei) + * * @return TEI string */ public String processXML(File file) throws Exception { @@ -1289,9 +1402,9 @@ public String processXML(File file) throws Exception { String newFilePath = null; try { String tmpFilePath = this.datastetConfiguration.getTmpPath(); - newFilePath = ArticleUtilities.applyPub2TEI(file.getAbsolutePath(), - tmpFilePath + "/" + fileName.replace(".xml", ".tei.xml"), - this.datastetConfiguration.getPub2TEIPath()); + newFilePath = ArticleUtilities.applyPub2TEI(file.getAbsolutePath(), + tmpFilePath + "/" + fileName.replace(".xml", ".tei.xml"), + this.datastetConfiguration.getPub2TEIPath()); //System.out.println(newFilePath); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -1310,181 +1423,341 @@ public String processXML(File file) throws Exception { return tei; } + /** - * Extract all software mentions from a publisher XML file + * Process dataset mentions from a TEI XML string */ - public Pair>, List> processTEIDocument(org.w3c.dom.Document doc, - boolean disambiguate, - boolean addParagraphContext) { - List> entities = new ArrayList<>(); - List resCitations = new ArrayList<>(); - - /*List> selectedLayoutTokenSequences = new ArrayList<>(); - List> selectedOriginalLayoutTokenSequences = new ArrayList<>(); - List docLayoutTokens = new ArrayList<>(); - - List>> selectedRefInfos = new ArrayList<>(); - - org.w3c.dom.NodeList paragraphList = doc.getElementsByTagName("p"); - int globalPos = 0; - for (int i = 0; i < paragraphList.getLength(); i++) { - org.w3c.dom.Element paragraphElement = (org.w3c.dom.Element) paragraphList.item(i); - - Pair>> contentTextAndRef = - XMLUtilities.getTextNoRefMarkersAndMarkerPositions(paragraphElement, globalPos); - String contentText = UnicodeUtil.normaliseText(contentTextAndRef.getLeft()); - Map> refInfos = contentTextAndRef.getRight(); - - if (contentText != null && contentText.length()>0) { - List paragraphTokens = - SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(contentText); - String orginalText = UnicodeUtil.normaliseText(paragraphElement.getTextContent()); - List originalParagraphTokens = - 
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(orginalText); - if (paragraphTokens != null && paragraphTokens.size() > 0) { - // shift the paragraph tokens to the global position - for(LayoutToken paragraphToken : paragraphTokens) { - paragraphToken.setOffset(paragraphToken.getOffset()+globalPos); - } - for(LayoutToken originalParagraphToken : originalParagraphTokens) { - originalParagraphToken.setOffset(originalParagraphToken.getOffset()+globalPos); - } + public Pair>, List> processTEIDocument(String documentAsString, + boolean segmentSentences, + boolean disambiguate, + boolean addParagraphContext) { + + Pair>, List> tei = null; + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + DocumentBuilder builder = factory.newDocumentBuilder(); + org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(documentAsString))); + //document.getDocumentElement().normalize(); + tei = processTEIDocument(document, segmentSentences, disambiguate, addParagraphContext); + } catch (ParserConfigurationException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (SAXException e) { + e.printStackTrace(); + } + return tei; + + } + + /** + * Extract dataset mentions from a TEI XML file + *

+ * LF: This method attempt to reproduce the extraction from PDF in processPDF but with an already extracted TEI as input + */ + public Pair>, List> processTEIDocument(org.w3c.dom.Document doc, + boolean segmentSentences, + boolean disambiguate, + boolean addParagraphContext) { + + List selectedSequences = new ArrayList<>(); + List relevantSectionsNamedDatasets = new ArrayList<>(); + List relevantSectionsImplicitDatasets = new ArrayList<>(); + + //Extract relevant section from the TEI + // Title, abstract, keywords - selectedLayoutTokenSequences.add(paragraphTokens); - docLayoutTokens.addAll(originalParagraphTokens); - - selectedRefInfos.add(refInfos); - selectedOriginalLayoutTokenSequences.add(originalParagraphTokens); + XPath xPath = XPathFactory.newInstance().newXPath(); + + try { + org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate("//*[local-name() = 'titleStmt']/*[local-name() = 'title']", + doc, + XPathConstants.NODE); + if (titleNode == null) { + LOGGER.warn("Title was not founded, skipping."); + } else { + String textTitle = titleNode.getTextContent(); + selectedSequences.add(textTitle); + relevantSectionsNamedDatasets.add(false); + relevantSectionsImplicitDatasets.add(false); + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Title was not founded, skipping."); + } + + try { + String expression = segmentSentences ? "//abstract/div/p" : "//abstract/div/p/s"; + String expressionNoNamespaces = getXPathWithoutNamespaces(expression); + org.w3c.dom.NodeList abstractNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, + doc, + XPathConstants.NODESET); + for (int i = 0; i < abstractNodeList.getLength(); i++) { + org.w3c.dom.Node item = abstractNodeList.item(i); + String abstractSentence = item.getTextContent(); + selectedSequences.add(abstractSentence); + //LF Not clear why true, just copied from around ProcessPDF:578 + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(false); + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Abstract was not founded, skipping."); + } + + try { + String expression = "//keywords/term"; + String expressionNoNamespaces = getXPathWithoutNamespaces(expression); + org.w3c.dom.NodeList keywordsNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, + doc, + XPathConstants.NODESET); + for (int i = 0; i < keywordsNodeList.getLength(); i++) { + org.w3c.dom.Node item = keywordsNodeList.item(i); + String keyword = item.getTextContent(); + selectedSequences.add(keyword); + relevantSectionsNamedDatasets.add(false); + relevantSectionsImplicitDatasets.add(false); + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Keywords was not founded, skipping."); + } + + // Extraction from Body + try { + String expression = segmentSentences ? 
"//text/body/div/p" : "//text/body/div/p/s"; + String expressionNoNamespaces = getXPathWithoutNamespaces(expression); + org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, + doc, + XPathConstants.NODESET); + for (int i = 0; i < bodyNodeList.getLength(); i++) { + org.w3c.dom.Node item = bodyNodeList.item(i); + String abstractSentence = item.getTextContent(); + selectedSequences.add(abstractSentence); + //LF Not clear why true, just copied from around ProcessPDF:635 + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(true); + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Abstract was not founded, skipping."); + } + + // Various statements (acknowledgement, funding, data availaiblity) + + // funding and acknowledgement at the moment have only paragraphs (Grobid issue # + List sectionTypesOnlyParagraphs = Arrays.asList("acknowledgement", "funding"); + + for (String sectionType : sectionTypesOnlyParagraphs) { + try { + String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; + org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); + for (int i = 0; i < annexNodeList.getLength(); i++) { + org.w3c.dom.Node item = annexNodeList.item(i); + String abstractSentence = item.getTextContent(); + selectedSequences.add(abstractSentence); + relevantSectionsNamedDatasets.add(false); + relevantSectionsImplicitDatasets.add(false); } - globalPos += contentText.length(); + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Abstract was not founded, skipping."); } } - processLayoutTokenSequences(selectedLayoutTokenSequences, entities, disambiguate, addParagraphContext, false, true); - selectedLayoutTokenSequences = selectedOriginalLayoutTokenSequences; + // Annex might contain misclassified relevant sections - // filter out components outside context, restore original tokenization - int sequenceIndex = 0; - for(SoftwareEntity entity : entities) { - if (entity.getSoftwareName() != null) { - String context = entity.getContext(); - if (context == null) - continue; - String oldContext = new String(entity.getContext()); - - int paragraphContextOffset = entity.getParagraphContextOffset(); - int globalContextOffset = entity.getGlobalContextOffset(); - SoftwareComponent softwareName = entity.getSoftwareName(); - - //System.out.println(softwareName.getRawForm() + " / " + softwareName.getOffsetStart() + "-" + softwareName.getOffsetEnd() + - // " / global offset: " + globalContextOffset + " / context: " + context + " / final context pos: " + (globalContextOffset+context.length()) ); - - for(int i=sequenceIndex; i selectedLayoutTokenSequence = selectedLayoutTokenSequences.get(i); - if (selectedLayoutTokenSequence != null && selectedLayoutTokenSequence.size()>0) { - posStartSequence = selectedLayoutTokenSequence.get(0).getOffset(); - String localText = LayoutTokensUtil.toText(selectedLayoutTokenSequence); - if (posStartSequence <= globalContextOffset && globalContextOffset context.length() || version.getOffsetEnd() > context.length())) { - entity.setVersion(null); - } - SoftwareComponent creator = entity.getCreator(); - if (creator != null && (creator.getOffsetStart() < 0 || creator.getOffsetEnd() < 0)) { - entity.setCreator(null); - } - if (creator != null && (creator.getOffsetStart() > 
context.length() || creator.getOffsetEnd() > context.length())) { - entity.setCreator(null); - } - SoftwareComponent url = entity.getSoftwareURL(); - if (url != null && (url.getOffsetStart() < 0 || url.getOffsetEnd() < 0)) { - entity.setSoftwareURL(null); - } - if (url != null && (url.getOffsetStart() > context.length() || url.getOffsetEnd() > context.length())) { - entity.setSoftwareURL(null); - } - SoftwareComponent language = entity.getLanguage(); - if (language != null && (language.getOffsetStart() < 0 || language.getOffsetEnd() < 0)) { - entity.setLanguage(null); - } - if (language != null && (language.getOffsetStart() > context.length() || language.getOffsetEnd() > context.length())) { - entity.setLanguage(null); + if (StringUtils.equals(currentSection, "das")) { + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(true); + } else { + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(false); + } } } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Annex was not founded, skipping."); } - // propagate the disambiguated entities to the non-disambiguated entities corresponding to the same software name - for(SoftwareEntity entity1 : entities) { - if (entity1.getSoftwareName() != null && entity1.getSoftwareName().getWikidataId() != null) { - for (SoftwareEntity entity2 : entities) { - if (entity2.getSoftwareName() != null && entity2.getSoftwareName().getWikidataId() != null) { - // if the entity is already disambiguated, nothing possible - continue; - } - if (entity2.getSoftwareName() != null && - entity2.getSoftwareName().getRawForm().equals(entity1.getSoftwareName().getRawForm())) { - entity1.getSoftwareName().copyKnowledgeInformationTo(entity2.getSoftwareName()); - entity2.getSoftwareName().setLang(entity1.getSoftwareName().getLang()); - } + // availability statement have sentences + DatastetAnalyzer datastetAnalyzer = DatastetAnalyzer.getInstance(); + + List sectionTypesAlsoSentences = Arrays.asList("availability"); + + List availabilityTokens = new ArrayList<>(); + for (String sectionType : sectionTypesAlsoSentences) { + try { + String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; + expression = segmentSentences ? expression + "/*[local-name() = 's']" : ""; + org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); + for (int i = 0; i < annexNodeList.getLength(); i++) { + org.w3c.dom.Node item = annexNodeList.item(i); + String abstractSentence = item.getTextContent(); + selectedSequences.add(abstractSentence); + availabilityTokens.addAll(analyzer.tokenizeWithLayoutToken(abstractSentence)); + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(true); } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Availability statement was not found, skipping."); } } + //Footnotes + try { + String expression = "//*[local-name() = 'text']/*[local-name() = 'body']/*[local-name() = 'note'][@*[local-name()='place' and .='foot']]/*[local-name() = 'div']/*[local-name() = 'p']"; + expression = segmentSentences ? 
expression + "/*[local-name() = 's']" : ""; + org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); + for (int i = 0; i < bodyNodeList.getLength(); i++) { + org.w3c.dom.Node item = bodyNodeList.item(i); + String abstractSentence = item.getTextContent(); + selectedSequences.add(abstractSentence); + //LF Not clear why true, just copied from around ProcessPDF:635 + relevantSectionsNamedDatasets.add(true); + relevantSectionsImplicitDatasets.add(false); + } + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Footnotes were not found or an error was thrown, skipping."); + } - // TODO sentence segmentation and related processing + //Dataset Recognition + + List> entities = new ArrayList<>(); + + List> selectedSequencesLayoutTokens = new ArrayList<>(); + List allDocumentTokens = new ArrayList<>(); + + int startingOffset = 0; + List sentenceOffsetStarts = new ArrayList<>(); + for (String sequence : selectedSequences) { + List sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence); + selectedSequencesLayoutTokens.add(sentenceTokens); + int finalStartingOffset = startingOffset; + List sentenceTokenAllTokens = sentenceTokens.stream() + .map(lt -> { + lt.setOffset(lt.getOffset() + finalStartingOffset); + return lt; + }) + .collect(Collectors.toList()); + + allDocumentTokens.addAll(sentenceTokenAllTokens); + sentenceOffsetStarts.add(startingOffset); + startingOffset += sequence.length(); + } + List> datasetLists = processing(selectedSequencesLayoutTokens, new ArrayList<>(), false); + entities.addAll(datasetLists); + for (int i = 0; i < entities.size(); i++) { + List datasets = entities.get(i); + if (datasets == null) { + continue; + } + for (Dataset dataset : datasets) { + if (dataset == null) + continue; + dataset.setGlobalContextOffset(sentenceOffsetStarts.get(i)); + } + } + + // TODO make sure that selectedSequences == allSentences above in the processPDF? 
+ List dataseerClassificationResults = classifyWithDataseerClassifier(selectedSequences); + + for (int i = 0; i < entities.size(); i++) { + List localDatasets = entities.get(i); + if (CollectionUtils.isNotEmpty(localDatasets)) { + continue; + } + for (Dataset localDataset : localDatasets) { + if (localDataset == null) { + continue; + } + DataseerResults result = dataseerClassificationResults.get(i); + + if (localDataset.getType() == DatasetType.DATASET && (result.getBestType() != null) && localDataset.getDataset() != null) { + localDataset.getDataset().setBestDataType(result.getBestType()); + localDataset.getDataset().setBestDataTypeScore(result.getBestScore()); + localDataset.getDataset().setHasDatasetScore(result.getHasDatasetScore()); + } + } + } - // second pass for document level consistency - // we prepare a matcher for all the identified dataset mention forms + //Dataset consolidation + + // we prepare a matcher for all the identified dataset mention forms FastMatcher termPattern = prepareTermPattern(entities); // we prepare the frequencies for each dataset name in the whole document - Map frequencies = prepareFrequencies(entities, doc.getTokenizations()); + Map frequencies = prepareFrequencies(entities, allDocumentTokens); // we prepare a map for mapping a dataset name with its positions of annotation in the document and its IDF Map termProfiles = prepareTermProfiles(entities); List> placeTaken = preparePlaceTaken(entities); - //System.out.println("entities size: " + entities.size()); - int index = 0; List> newEntities = new ArrayList<>(); - for (List sentenceTokens : allLayoutTokens) { - List localEntities = propagateLayoutTokenSequence(sentenceTokens, - entities.get(index), - termProfiles, - termPattern, - placeTaken.get(index), - frequencies, - sentenceOffsetStarts.get(index)); + for (List sentenceTokens : selectedSequencesLayoutTokens) { + List localEntities = propagateLayoutTokenSequence(sentenceTokens, + entities.get(index), + termProfiles, + termPattern, + placeTaken.get(index), + frequencies, + sentenceOffsetStarts.get(index)); if (localEntities != null) { Collections.sort(localEntities); - + // revisit and attach URL component - localEntities = attachUrlComponents(localEntities, sentenceTokens, allSentences.get(index), pdfAnnotations); + localEntities = attachUrlComponents(localEntities, sentenceTokens, selectedSequences.get(index), new ArrayList<>()); } newEntities.add(localEntities); @@ -1497,348 +1770,217 @@ public Pair>, List> processTEIDocument(org.w3c.do // filter implicit datasets based on selected relevant data section List> filteredEntities = new ArrayList<>(); - index = 0; - for(List localDatasets : entities) { + for (int i = 0; i < entities.size(); i++) { + List localDatasets = entities.get(i); List filteredLocalEntities = new ArrayList<>(); - if (mapSentencesToZones.get(index) == null) { - index++; - continue; - } - - Integer currentZoneObject = mapSentencesToZones.get(index); - if (currentZoneObject == null) { - index++; - continue; - } - - int currentZone = currentZoneObject.intValue(); - for (Dataset localDataset : localDatasets) { boolean referenceDataSource = false; - if (localDataset.getUrl() != null && - DatastetLexicon.getInstance().isDatasetURLorDOI(localDataset.getUrl().getNormalizedForm())) { + if (localDataset.getUrl() != null && + DatastetLexicon.getInstance().isDatasetURLorDOI(localDataset.getUrl().getNormalizedForm())) { referenceDataSource = true; } if (localDataset.getType() == DatasetType.DATASET && - 
!relevantSectionsImplicitDatasets.get(currentZone) && !referenceDataSource) { + !relevantSectionsImplicitDatasets.get(i) && !referenceDataSource) { continue; - } + } if (localDataset.getType() == DatasetType.DATASET_NAME && - !relevantSectionsNamedDatasets.get(currentZone)) { + !relevantSectionsNamedDatasets.get(i)) { continue; - } + } if (localDataset.getType() == DatasetType.DATASET && - localDataset.getDataset() != null && - localDataset.getDataset().getHasDatasetScore() < 0.5 && !referenceDataSource) { + localDataset.getDataset() != null && + localDataset.getDataset().getHasDatasetScore() < 0.5 && !referenceDataSource) { continue; - } - + } + filteredLocalEntities.add(localDataset); } filteredEntities.add(filteredLocalEntities); - index++; } entities = filteredEntities; - - //Collections.sort(entities); - - // local bibliographical references to spot in the XML mark-up, to attach and propagate - List resCitations = new ArrayList<>(); - org.w3c.dom.NodeList bibList = doc.getElementsByTagName("biblStruct"); - for (int i = 0; i < bibList.getLength(); i++) { - org.w3c.dom.Element biblStructElement = (org.w3c.dom.Element) bibList.item(i); - - // filter not having as father - org.w3c.dom.Node fatherNode = biblStructElement.getParentNode(); - if (fatherNode != null) { - if (!"listBibl".equals(fatherNode.getNodeName())) - continue; - } - - BiblioItem biblio = XMLUtilities.parseTEIBiblioItem(biblStructElement); - - BibDataSet bds = new BibDataSet(); - bds.setResBib(biblio); - bds.setRefSymbol(biblStructElement.getAttribute("xml:id")); - resCitations.add(bds); + // we attach and match bibliographical reference callout +// TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); + // second pass, body + +// if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { +// List bibRefComponents = new ArrayList(); +// for (TaggingTokenCluster cluster : bodyClusters) { +// if (cluster == null) { +// continue; +// } +// +// TaggingLabel clusterLabel = cluster.getTaggingLabel(); +// +// List localTokenization = cluster.concatTokens(); +// if ((localTokenization == null) || (localTokenization.size() == 0)) +// continue; +// +// if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { +// List refTokens = TextUtilities.dehyphenize(localTokenization); +// String chunkRefString = LayoutTokensUtil.toText(refTokens); +// +// List refNodes = formatter.markReferencesTEILuceneBased(refTokens, +// doc.getReferenceMarkerMatcher(), +// true, // generate coordinates +// false); // do not mark unsolved callout as ref +// +// if (refNodes != null) { +// for (Node refNode : refNodes) { +// if (refNode instanceof Element) { +// // get the bib ref key +// String refKey = ((Element) refNode).getAttributeValue("target"); +// +// if (refKey == null) +// continue; +// +// int refKeyVal = -1; +// if (refKey.startsWith("#b")) { +// refKey = refKey.substring(2, refKey.length()); +// try { +// refKeyVal = Integer.parseInt(refKey); +// } catch (Exception e) { +// LOGGER.warn("Invalid ref identifier: " + refKey); +// } +// } +// if (refKeyVal == -1) +// continue; +// +// // get the bibref object +// BibDataSet resBib = resCitations.get(refKeyVal); +// if (resBib != null) { +// BiblioComponent biblioComponent = new BiblioComponent(resBib.getResBib(), refKeyVal); +// biblioComponent.setRawForm(refNode.getValue()); +// biblioComponent.setOffsetStart(refTokens.get(0).getOffset()); +// biblioComponent.setOffsetEnd(refTokens.get(refTokens.size() - 1).getOffset() + +// 
refTokens.get(refTokens.size() - 1).getText().length()); +// List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); +// biblioComponent.setBoundingBoxes(boundingBoxes); +// bibRefComponents.add(biblioComponent); +// } +// } +// } +// } +// } +// } +// +// +// if (bibRefComponents.size() > 0) { +// // attach references to dataset entities +// entities = attachRefBib(entities, bibRefComponents); +// } +// +// // consolidate the attached ref bib (we don't consolidate all bibliographical references +// // to avoid useless costly computation) +// List citationsToConsolidate = new ArrayList(); +// List consolidated = new ArrayList(); +// for (List datasets : entities) { +// for (Dataset entity : datasets) { +// if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { +// List bibRefs = entity.getBibRefs(); +// for (BiblioComponent bibRef : bibRefs) { +// Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); +// if (!consolidated.contains(refKeyVal)) { +// citationsToConsolidate.add(resCitations.get(refKeyVal)); +// consolidated.add(refKeyVal); +// } +// } +// } +// } +// } +// +// try { +// Consolidation consolidator = Consolidation.getInstance(); +// Map resConsolidation = consolidator.consolidate(citationsToConsolidate); +// for (int j = 0; j < citationsToConsolidate.size(); j++) { +// BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); +// BiblioItem bibo = resConsolidation.get(j); +// if (bibo != null) { +// BiblioItem.correct(resCitation, bibo); +// } +// } +// } catch (Exception e) { +// throw new GrobidException( +// "An exception occured while running consolidation on bibliographical references.", e); +// } +// +// // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
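Editor's note: a hedged, stand-alone illustration of the de-duplication step visible in the disabled block above, where only references actually attached to dataset mentions are consolidated, each at most once; the reference keys below are invented.

    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    class ConsolidationSelectionSketch {
        public static void main(String[] args) {
            // reference keys attached to dataset mentions, possibly repeated across mentions
            List<Integer> attachedRefKeys = List.of(3, 7, 3, 12, 7);
            Set<Integer> toConsolidate = new LinkedHashSet<>(attachedRefKeys);
            System.out.println(toConsolidate); // [3, 7, 12] -> each cited reference consolidated once
        }
    }
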
+// for (List datasets1 : entities) { +// for (Dataset entity1 : datasets1) { +// if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { +// for (List datasets2 : entities) { +// for (Dataset entity2 : datasets2) { +// if (entity2.getBibRefs() != null) { +// continue; +// } +// if ((entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && +// entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && +// (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || +// entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) +// ) { +// List newBibRefs = new ArrayList<>(); +// for (BiblioComponent bibComponent : entity1.getBibRefs()) { +// newBibRefs.add(new BiblioComponent(bibComponent)); +// } +// entity2.setBibRefs(newBibRefs); +// } +// } +// } +// } +// } +// } +// } + + // mark datasets present in Data Availability section(s) + if (CollectionUtils.isNotEmpty(availabilityTokens)) { + entities = markDAS(entities, availabilityTokens); } - entities = attachReferencesXML(entities, - selectedRefInfos, - resCitations); - - // consolidate the attached ref bib (we don't consolidate all bibliographical references - // to avoid useless costly computation) - List citationsToConsolidate = new ArrayList(); - List consolidated = new ArrayList(); - for(SoftwareEntity entity : entities) { - if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { - List bibRefs = entity.getBibRefs(); - for(BiblioComponent bibRef: bibRefs) { - Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); - if (!consolidated.contains(refKeyVal)) { - citationsToConsolidate.add(resCitations.get(refKeyVal)); - consolidated.add(refKeyVal); - } - } - } - } - - try { - Consolidation consolidator = Consolidation.getInstance(); - Map resConsolidation = consolidator.consolidate(citationsToConsolidate); - for(int i=0; i0) { - for(SoftwareEntity entity1 : entities) { - if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { - for (SoftwareEntity entity2 : entities) { - if (entity2.getBibRefs() != null) { - continue; - } - if (entity2.getSoftwareName() != null && - entity2.getSoftwareName().getRawForm().equals(entity1.getSoftwareName().getRawForm())) { - List newBibRefs = new ArrayList<>(); - for(BiblioComponent bibComponent : entity1.getBibRefs()) { - newBibRefs.add(new BiblioComponent(bibComponent)); - } - entity2.setBibRefs(newBibRefs); - } - } - } - } - } - - Collections.sort(entities); - - try{ - // filter implicit datasets based on selected relevant data section - List> filteredEntities = new ArrayList<>(); - index = 0; - for(List localDatasets : entities) { - List filteredLocalEntities = new ArrayList<>(); - - if (mapSentencesToZones.get(index) == null) { - index++; - continue; - } - - Integer currentZoneObject = mapSentencesToZones.get(index); - if (currentZoneObject == null) { - index++; - continue; - } - - int currentZone = currentZoneObject.intValue(); - - for (Dataset localDataset : localDatasets) { - boolean referenceDataSource = false; - if (localDataset.getUrl() != null && - DatastetLexicon.getInstance().isDatasetURLorDOI(localDataset.getUrl().getNormalizedForm())) { - referenceDataSource = true; - } - - if (localDataset.getType() == DatasetType.DATASET && - !relevantSectionsImplicitDatasets.get(currentZone) && !referenceDataSource) { - continue; - } - - if (localDataset.getType() == DatasetType.DATASET_NAME && - 
!relevantSectionsNamedDatasets.get(currentZone)) { - continue; - } - - if (localDataset.getType() == DatasetType.DATASET && - localDataset.getDataset() != null && - localDataset.getDataset().getHasDatasetScore() < 0.5 && !referenceDataSource) { - continue; - } - - filteredLocalEntities.add(localDataset); - } - - filteredEntities.add(filteredLocalEntities); - index++; - } - entities = filteredEntities; - - // we attach and match bibliographical reference callout - TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); - // second pass, body - if ( (bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0) ) { - List bibRefComponents = new ArrayList(); - for (TaggingTokenCluster cluster : bodyClusters) { - if (cluster == null) { - continue; - } - - TaggingLabel clusterLabel = cluster.getTaggingLabel(); - - List localTokenization = cluster.concatTokens(); - if ((localTokenization == null) || (localTokenization.size() == 0)) - continue; - - if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { - List refTokens = TextUtilities.dehyphenize(localTokenization); - String chunkRefString = LayoutTokensUtil.toText(refTokens); - - List refNodes = formatter.markReferencesTEILuceneBased(refTokens, - doc.getReferenceMarkerMatcher(), - true, // generate coordinates - false); // do not mark unsolved callout as ref - - if (refNodes != null) { - for (nu.xom.Node refNode : refNodes) { - if (refNode instanceof Element) { - // get the bib ref key - String refKey = ((Element)refNode).getAttributeValue("target"); - - if (refKey == null) - continue; - - int refKeyVal = -1; - if (refKey.startsWith("#b")) { - refKey = refKey.substring(2, refKey.length()); - try { - refKeyVal = Integer.parseInt(refKey); - } catch(Exception e) { - LOGGER.warn("Invalid ref identifier: " + refKey); - } - } - if (refKeyVal == -1) - continue; - - // get the bibref object - BibDataSet resBib = resCitations.get(refKeyVal); - if (resBib != null) { - BiblioComponent biblioComponent = new BiblioComponent(resBib.getResBib(), refKeyVal); - biblioComponent.setRawForm(refNode.getValue()); - biblioComponent.setOffsetStart(refTokens.get(0).getOffset()); - biblioComponent.setOffsetEnd(refTokens.get(refTokens.size()-1).getOffset() + - refTokens.get(refTokens.size()-1).getText().length()); - List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); - biblioComponent.setBoundingBoxes(boundingBoxes); - bibRefComponents.add(biblioComponent); - } - } - } - } - } - } + // finally classify the context for predicting the role of the dataset mention + entities = DatasetContextClassifier.getInstance(datastetConfiguration) + .classifyDocumentContexts(entities); + List resCitations = List.of(); + return Pair.of(entities, resCitations); + } - if (bibRefComponents.size() > 0) { - // attach references to dataset entities - entities = attachRefBib(entities, bibRefComponents); - } - - // consolidate the attached ref bib (we don't consolidate all bibliographical references - // to avoid useless costly computation) - List citationsToConsolidate = new ArrayList(); - List consolidated = new ArrayList(); - for(List datasets : entities) { - for(Dataset entity : datasets) { - if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { - List bibRefs = entity.getBibRefs(); - for(BiblioComponent bibRef: bibRefs) { - Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); - if (!consolidated.contains(refKeyVal)) { - citationsToConsolidate.add(resCitations.get(refKeyVal)); - consolidated.add(refKeyVal); - } - } - } - } 
- } - - try { - Consolidation consolidator = Consolidation.getInstance(); - Map resConsolidation = consolidator.consolidate(citationsToConsolidate); - for(int j=0; j datasets1 : entities) { - for(Dataset entity1 : datasets1) { - if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { - for(List datasets2 : entities) { - for(Dataset entity2 : datasets2) { - if (entity2.getBibRefs() != null) { - continue; - } - if ( (entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && - entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && - (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || - entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) - ) { - List newBibRefs = new ArrayList<>(); - for(BiblioComponent bibComponent : entity1.getBibRefs()) { - newBibRefs.add(new BiblioComponent(bibComponent)); - } - entity2.setBibRefs(newBibRefs); - } - } - } - } - } - } + public static String getXPathWithoutNamespaces(String s) { + StringBuilder sb = new StringBuilder(); + for (String item : s.split("/")) { + if (item.isEmpty()) { + sb.append("/"); + } else { + sb.append("/*[local-name() = '").append(item).append("']"); } + } + String output = sb.toString().replaceAll("^///", "//"); - // finally classify the context for predicting the role of the dataset mention - entities = DatasetContextClassifier.getInstance(datastetConfiguration).classifyDocumentContexts(entities); - - } catch (Exception e) { - //e.printStackTrace(); - throw new GrobidException("Cannot process pdf file: " + file.getPath(), e); - }*/ - - return Pair.of(entities, resCitations); + return output; } /** * Process with the dataset model a set of arbitrary sequence of LayoutTokenization - */ - private List> processLayoutTokenSequences(List> layoutTokenList, - List> entities, - List sentenceOffsetStarts, - List pdfAnnotations, - boolean disambiguate) { + */ + private List> processLayoutTokenSequences(List> layoutTokenList, + List> entities, + List sentenceOffsetStarts, + List pdfAnnotations, + boolean disambiguate) { List> results = processing(layoutTokenList, pdfAnnotations, disambiguate); entities.addAll(results); int i = 0; - for(List datasets : entities) { + for (List datasets : entities) { if (datasets == null) { i++; continue; } - for(Dataset dataset : datasets) { + for (Dataset dataset : datasets) { if (dataset == null) continue; dataset.setGlobalContextOffset(sentenceOffsetStarts.get(i)); @@ -1854,8 +1996,16 @@ public static boolean isNewParagraph(TaggingLabel lastClusterLabel) { && lastClusterLabel != TaggingLabels.TABLE); } + public static boolean checkAuthorAnnex(String sectionHead) { + return StringUtils.startsWithIgnoreCase(sectionHead, "author"); + } + + public static boolean checkAbbreviationAnnex(String sectionHead) { + return StringUtils.startsWithIgnoreCase(sectionHead, "abbreviation"); + } + public static boolean checkAuthorAnnex(List annexTokens) { - for(int i=0; i annexTokens) { } public static boolean checkAbbreviationAnnex(List annexTokens) { - for(int i=0; i annexTokens) { return false; } + public static boolean checkDASAnnex(String sectionHead) { + boolean dataFound = false; + boolean availabilityFound = false; + if (StringUtils.containsIgnoreCase(sectionHead, "data") + || StringUtils.containsIgnoreCase(sectionHead, "code")) { + dataFound = true; + } + if (StringUtils.containsIgnoreCase(sectionHead, "availab") + || StringUtils.containsIgnoreCase(sectionHead, 
"sharing")) { + availabilityFound = true; + } + + if (dataFound && availabilityFound) + return true; + + return false; + } + public static boolean checkDASAnnex(List annexTokens) { boolean dataFound = false; boolean availabilityFound = false; - for(int i=0; i> attachRefBib(List> entities, List datasets : entities) { - for(Dataset entity : datasets) { + for (List datasets : entities) { + for (Dataset entity : datasets) { if (entity.getDatasetName() == null) continue; @@ -1914,57 +2082,57 @@ public List> attachRefBib(List> entities, List= pos) && - (refBib.getOffsetStart() <= endPos+5) ) { + if ((refBib.getOffsetStart() >= pos) && + (refBib.getOffsetStart() <= endPos + 5)) { entity.addBibRef(refBib); endPos = refBib.getOffsetEnd(); } } } } - + return entities; } public List> preparePlaceTaken(List> entities) { List> localPositions = new ArrayList<>(); - for(List datasets : entities) { + for (List datasets : entities) { List localSentencePositions = new ArrayList<>(); - for(Dataset entity : datasets) { + for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent != null) { List localTokens = nameComponent.getTokens(); - localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), - localTokens.get(localTokens.size()-1).getOffset() + localTokens.get(localTokens.size()-1).getText().length()-1)); + localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), + localTokens.get(localTokens.size() - 1).getOffset() + localTokens.get(localTokens.size() - 1).getText().length() - 1)); DatasetComponent publisherComponent = entity.getPublisher(); if (publisherComponent != null) { localTokens = publisherComponent.getTokens(); if (localTokens.size() > 0) { - localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), - localTokens.get(localTokens.size()-1).getOffset() + localTokens.get(localTokens.size()-1).getText().length()-1)); + localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), + localTokens.get(localTokens.size() - 1).getOffset() + localTokens.get(localTokens.size() - 1).getText().length() - 1)); } } } nameComponent = entity.getDataset(); if (nameComponent != null) { List localTokens = nameComponent.getTokens(); - localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), - localTokens.get(localTokens.size()-1).getOffset() + localTokens.get(localTokens.size()-1).getText().length()-1)); + localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), + localTokens.get(localTokens.size() - 1).getOffset() + localTokens.get(localTokens.size() - 1).getText().length() - 1)); DatasetComponent deviceComponent = entity.getDataDevice(); if (deviceComponent != null) { localTokens = deviceComponent.getTokens(); if (localTokens.size() > 0) { - localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), - localTokens.get(localTokens.size()-1).getOffset() + localTokens.get(localTokens.size()-1).getText().length()-1)); + localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), + localTokens.get(localTokens.size() - 1).getOffset() + localTokens.get(localTokens.size() - 1).getText().length() - 1)); } } } @@ -1972,8 +2140,8 @@ public List> preparePlaceTaken(List> entities if (urlComponent != null) { List localTokens = urlComponent.getTokens(); if (localTokens.size() > 0) { - localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), - localTokens.get(localTokens.size()-1).getOffset() + 
localTokens.get(localTokens.size()-1).getText().length()-1)); + localSentencePositions.add(new OffsetPosition(localTokens.get(0).getOffset(), + localTokens.get(localTokens.size() - 1).getOffset() + localTokens.get(localTokens.size() - 1).getText().length() - 1)); } } } @@ -1985,8 +2153,8 @@ public List> preparePlaceTaken(List> entities public Map prepareTermProfiles(List> entities) { Map result = new TreeMap(); - for(List datasets : entities) { - for(Dataset entity : datasets) { + for (List datasets : entities) { + for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent == null) continue; @@ -2018,14 +2186,14 @@ public Map prepareTermProfiles(List> entities) { } if (term.endsWith("dataset") || term.endsWith("Dataset")) { - String termAlt = term+"s"; + String termAlt = term + "s"; profile = result.get(termAlt); if (profile == null) { profile = DatastetLexicon.getInstance().getTermIDF(termAlt); result.put(termAlt, profile); } } else if (term.endsWith("datasets") || term.endsWith("Datasets")) { - String termAlt = term.substring(0,term.length()-1); + String termAlt = term.substring(0, term.length() - 1); profile = result.get(termAlt); if (profile == null) { profile = DatastetLexicon.getInstance().getTermIDF(termAlt); @@ -2044,13 +2212,13 @@ public Map prepareTermProfiles(List> entities) { } return result; - } + } public FastMatcher prepareTermPattern(List> entities) { FastMatcher termPattern = new FastMatcher(); List added = new ArrayList<>(); - for(List datasets : entities) { - for(Dataset entity : datasets) { + for (List datasets : entities) { + for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent == null) continue; @@ -2062,9 +2230,9 @@ public FastMatcher prepareTermPattern(List> entities) { continue; // for safety, we don't propagate something that looks like a stopword with simply an Uppercase first letter - if (FeatureFactory.getInstance().test_first_capital(term) && - !FeatureFactory.getInstance().test_all_capital(term) && - DatastetLexicon.getInstance().isEnglishStopword(term.toLowerCase()) ) { + if (FeatureFactory.getInstance().test_first_capital(term) && + !FeatureFactory.getInstance().test_all_capital(term) && + DatastetLexicon.getInstance().isEnglishStopword(term.toLowerCase())) { continue; } @@ -2087,22 +2255,22 @@ public FastMatcher prepareTermPattern(List> entities) { termPattern.loadTerm(termCleaned, DatastetAnalyzer.getInstance(), false); added.add(termCleaned); } - + // add common trivial variant singular/plurial if (term.endsWith("dataset") || term.endsWith("Dataset")) { - String termAlt = term+"s"; + String termAlt = term + "s"; if (!added.contains(termAlt)) { termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false); added.add(termAlt); } } else if (term.endsWith("datasets") || term.endsWith("Datasets")) { - String termAlt = term.substring(0,term.length()-1); + String termAlt = term.substring(0, term.length() - 1); if (!added.contains(termAlt)) { termPattern.loadTerm(termAlt, DatastetAnalyzer.getInstance(), false); added.add(termAlt); } } - + if (!term.equals(nameComponent.getNormalizedForm())) { if (!added.contains(nameComponent.getNormalizedForm())) { termPattern.loadTerm(nameComponent.getNormalizedForm(), DatastetAnalyzer.getInstance(), false); @@ -2116,8 +2284,8 @@ public FastMatcher prepareTermPattern(List> entities) { public Map prepareFrequencies(List> entities, List tokens) { Map frequencies = new TreeMap(); - for(List datasets : entities) { - 
for(Dataset entity : datasets) { + for (List datasets : entities) { + for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent == null) continue; @@ -2128,7 +2296,7 @@ public Map prepareFrequencies(List> entities, Lis List results = localTermPattern.matchLayoutToken(tokens, true, true); // ignore delimiters, but case sensitive matching int freq = 0; - if (results != null) { + if (results != null) { freq = results.size(); } frequencies.put(term, Integer.valueOf(freq)); @@ -2138,32 +2306,32 @@ public Map prepareFrequencies(List> entities, Lis return frequencies; } - public List propagateLayoutTokenSequence(List layoutTokens, - List entities, - Map termProfiles, - FastMatcher termPattern, - List placeTaken, - Map frequencies, - int sentenceOffsetStart) { + public List propagateLayoutTokenSequence(List layoutTokens, + List entities, + Map termProfiles, + FastMatcher termPattern, + List placeTaken, + Map frequencies, + int sentenceOffsetStart) { List results = termPattern.matchLayoutToken(layoutTokens, true, true); // above: do not ignore delimiters and case sensitive matching - if ( (results == null) || (results.size() == 0) ) { + if (CollectionUtils.isEmpty(results)) { return entities; } String localText = LayoutTokensUtil.toText(layoutTokens); //System.out.println(results.size() + " results for: " + localText); - for(OffsetPosition position : results) { + for (OffsetPosition position : results) { // the match positions are expressed relative to the local layoutTokens index, while the offset at // token level are expressed relative to the complete doc positions in characters - List matchedTokens = layoutTokens.subList(position.start, position.end+1); - + List matchedTokens = layoutTokens.subList(position.start, position.end + 1); + // we recompute matched position using local tokens (safer than using doc level offsets) int matchedPositionStart = 0; - for(int i=0; i < position.start; i++) { + for (int i = 0; i < position.start; i++) { LayoutToken theToken = layoutTokens.get(i); if (theToken.getText() == null) continue; @@ -2171,12 +2339,12 @@ public List propagateLayoutTokenSequence(List layoutTokens } String term = LayoutTokensUtil.toText(matchedTokens); - OffsetPosition matchedPosition = new OffsetPosition(matchedPositionStart, matchedPositionStart+term.length()); + OffsetPosition matchedPosition = new OffsetPosition(matchedPositionStart, matchedPositionStart + term.length()); // this positions is expressed at document-level, to check if we have not matched something already recognized OffsetPosition rawMatchedPosition = new OffsetPosition( - matchedTokens.get(0).getOffset(), - matchedTokens.get(matchedTokens.size()-1).getOffset() + matchedTokens.get(matchedTokens.size()-1).getText().length() + matchedTokens.get(0).getOffset(), + matchedTokens.get(matchedTokens.size() - 1).getOffset() + matchedTokens.get(matchedTokens.size() - 1).getText().length() ); int termFrequency = 1; @@ -2185,7 +2353,7 @@ public List propagateLayoutTokenSequence(List layoutTokens // check the tf-idf of the term double tfidf = -1.0; - + // is the match already present in the entity list? 
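Editor's note: a minimal sketch (with invented offsets) of the span-overlap test this check relies on; the actual overlapsPosition helper defined further below uses a similar set of start/end comparisons over document-level character spans.

    class OverlapSketch {
        // true when the two character spans share at least one position
        static boolean overlaps(int aStart, int aEnd, int bStart, int bEnd) {
            return aStart <= bEnd && bStart <= aEnd;
        }

        public static void main(String[] args) {
            System.out.println(overlaps(10, 20, 15, 25)); // true: positions 15..20 are shared
            System.out.println(overlaps(10, 20, 21, 30)); // false: the spans are disjoint
        }
    }
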
if (overlapsPosition(placeTaken, rawMatchedPosition)) { continue; @@ -2197,7 +2365,7 @@ public List propagateLayoutTokenSequence(List layoutTokens // ideally we should make a small classifier here with entity frequency, tfidf, disambiguation success and // and/or log-likelyhood/dice coefficient as features - but for the time being we introduce a simple rule // with an experimentally defined threshold: - if ( (tfidf <= 0) || (tfidf > 0.001) ) { + if ((tfidf <= 0) || (tfidf > 0.001)) { // add new entity mention DatasetComponent name = new DatasetComponent(); name.setRawForm(term); @@ -2217,7 +2385,7 @@ public List propagateLayoutTokenSequence(List layoutTokens //entity.setType(DatastetLexicon.Dataset_Type.DATASET); entity.setPropagated(true); entity.setGlobalContextOffset(sentenceOffsetStart); - if (entities == null) + if (entities == null) entities = new ArrayList<>(); entities.add(entity); } @@ -2227,15 +2395,15 @@ public List propagateLayoutTokenSequence(List layoutTokens private boolean overlapsPosition(final List list, final OffsetPosition position) { for (OffsetPosition pos : list) { - if (pos.start == position.start) + if (pos.start == position.start) return true; - if (pos.end == position.end) + if (pos.end == position.end) return true; - if (position.start <= pos.start && pos.start <= position.end) + if (position.start <= pos.start && pos.start <= position.end) return true; - if (pos.start <= position.start && position.start <= pos.end) + if (pos.start <= position.start && position.start <= pos.end) return true; - } + } return false; } @@ -2245,7 +2413,7 @@ public static List characterPositionsUrlPattern(List characterPositionsUrlPattern(List urlTokens = new ArrayList<>(); int tokenPos = 0; int tokenIndex = 0; - for(LayoutToken localToken : layoutTokens) { - if (startPos <= tokenPos && (tokenPos+localToken.getText().length() <= endPos) ) { + for (LayoutToken localToken : layoutTokens) { + if (startPos <= tokenPos && (tokenPos + localToken.getText().length() <= endPos)) { urlTokens.add(localToken); if (startTokenIndex == -1) startTokenIndex = tokenIndex; @@ -2276,13 +2444,13 @@ public static List characterPositionsUrlPattern(List0) { - LayoutToken lastToken = urlTokens.get(urlTokens.size()-1); + if (urlTokens.size() > 0) { + LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); if (pdfAnnotations != null) { for (PDFAnnotation pdfAnnotation : pdfAnnotations) { - if (pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI) { + if (pdfAnnotation.getType() != null && pdfAnnotation.getType() == Type.URI) { if (pdfAnnotation.cover(lastToken)) { - //System.out.println("found overlapping PDF annotation for URL: " + pdfAnnotation.getDestination()); + //System.out.println("found overlapping PDF annotation for URL: " + pdfAnnotation.getDestination()); targetAnnotation = pdfAnnotation; break; } @@ -2296,16 +2464,16 @@ public static List characterPositionsUrlPattern(List characterPositionsUrlPattern(List characterPositionsUrlPattern(List attachUrlComponents(List datasets, - List tokens, - String sentenceString, + public List attachUrlComponents(List datasets, + List tokens, + String sentenceString, List pdfAnnotations) { // revisit url including propagated dataset names if (datasets == null || datasets.size() == 0) { return datasets; } - for(Dataset dataset : datasets) { + for (Dataset dataset : datasets) { if (dataset == null) continue; @@ -2354,17 +2522,17 @@ public List attachUrlComponents(List datasets, } List localDatasetcomponents = new ArrayList<>(); - 
for(Dataset dataset : datasets) { + for (Dataset dataset : datasets) { if (dataset.getDataset() != null) localDatasetcomponents.add(dataset.getDataset()); if (dataset.getDatasetName() != null) localDatasetcomponents.add(dataset.getDatasetName()); if (dataset.getDataDevice() != null) localDatasetcomponents.add(dataset.getDataDevice()); - if (dataset.getPublisher() != null) + if (dataset.getPublisher() != null) localDatasetcomponents.add(dataset.getPublisher()); if (dataset.getBibRefs() != null) { - for(BiblioComponent biblio : dataset.getBibRefs()) { + for (BiblioComponent biblio : dataset.getBibRefs()) { localDatasetcomponents.add(biblio); } } @@ -2379,28 +2547,29 @@ public List attachUrlComponents(List datasets, while (localDatasetcomponents.size() - sizeBefore > 0) { DatasetComponent previousComponent = null; DatasetComponent urlComponent = null; - for(DatasetComponent localDatasetcomponent : localDatasetcomponents) { + for (DatasetComponent localDatasetcomponent : localDatasetcomponents) { if (localDatasetcomponent.getType() == DatasetType.URL && previousComponent != null) { urlComponent = localDatasetcomponent; break; - } + } - if (localDatasetcomponent.getType() == DatasetType.DATASET_NAME || localDatasetcomponent.getType() == DatasetType.DATASET) + if (localDatasetcomponent.getType() == DatasetType.DATASET_NAME || localDatasetcomponent.getType() == DatasetType.DATASET) previousComponent = localDatasetcomponent; } - if (previousComponent != null && urlComponent != null) {; + if (previousComponent != null && urlComponent != null) { + ; // URL attachment - for(Dataset dataset : datasets) { + for (Dataset dataset : datasets) { if (dataset.getDataset() != null && previousComponent.getType() == DatasetType.DATASET) { - if (dataset.getDataset().getOffsetStart() == previousComponent.getOffsetStart() && - dataset.getDataset().getOffsetEnd() == previousComponent.getOffsetEnd()) { + if (dataset.getDataset().getOffsetStart() == previousComponent.getOffsetStart() && + dataset.getDataset().getOffsetEnd() == previousComponent.getOffsetEnd()) { dataset.setUrl(urlComponent); break; } } else if (dataset.getDatasetName() != null && previousComponent.getType() == DatasetType.DATASET_NAME) { - if (dataset.getDatasetName().getOffsetStart() == previousComponent.getOffsetStart() && - dataset.getDatasetName().getOffsetEnd() == previousComponent.getOffsetEnd()) { + if (dataset.getDatasetName().getOffsetStart() == previousComponent.getOffsetStart() && + dataset.getDatasetName().getOffsetEnd() == previousComponent.getOffsetEnd()) { dataset.setUrl(urlComponent); break; } diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index fc3a68d..e5e9c2b 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -1,42 +1,36 @@ package org.grobid.core.utilities; -import java.io.*; -import javax.xml.parsers.*; -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; -import javax.xml.namespace.NamespaceContext; -import javax.xml.xpath.*; - -import java.util.Iterator; -import java.util.List; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Map; -import java.util.TreeMap; - -import javax.xml.parsers.*; -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; -import javax.xml.namespace.NamespaceContext; -import 
javax.xml.xpath.*; - -import org.w3c.dom.*; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.StringUtils; - -import org.grobid.core.document.xml.XmlBuilderUtils; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.data.BiblioItem; import org.grobid.core.sax.BiblStructSaxHandler; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactory; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.*; /** * Some convenient methods for suffering a bit less with XML. diff --git a/src/main/java/org/grobid/service/controller/DatastetController.java b/src/main/java/org/grobid/service/controller/DatastetController.java index 850539e..376f470 100644 --- a/src/main/java/org/grobid/service/controller/DatastetController.java +++ b/src/main/java/org/grobid/service/controller/DatastetController.java @@ -42,6 +42,7 @@ public class DatastetController implements DatastetPaths { private static final String INPUT = "input"; private static final String JSON = "json"; private static final String ADD_PARAGRAPH_CONTEXT = "addParagraphContext"; + private static final String SEGMENT_SENTENCES = "segmentSentences"; private DatastetConfiguration configuration; @@ -107,6 +108,20 @@ public Response processDatasetPDF(@FormDataParam(INPUT) InputStream inputStream, return DatastetProcessFile.processDatasetPDF(inputStream, addParagraphContextBoolean); } + @Path(PATH_DATASET_TEI) + @Consumes(MediaType.MULTIPART_FORM_DATA) + @Produces(MediaType.APPLICATION_JSON) + @POST + public Response processDatasetTEI( + @FormDataParam(INPUT) InputStream inputStream, + @DefaultValue("0") @FormDataParam(SEGMENT_SENTENCES) String segmentSentences, + @DefaultValue("0") @FormDataParam(ADD_PARAGRAPH_CONTEXT) String addParagraphContext + ) { + boolean addParagraphContextBoolean = DatastetServiceUtils.validateBooleanRawParam(addParagraphContext); + boolean segmentSentencesBoolean = DatastetServiceUtils.validateBooleanRawParam(segmentSentences); + return DatastetProcessFile.processDatasetTEI(inputStream, segmentSentencesBoolean, addParagraphContextBoolean); + } + @Path(PATH_DATASEER_TEI) @Consumes(MediaType.MULTIPART_FORM_DATA) @Produces(MediaType.APPLICATION_XML) @@ -114,7 +129,8 @@ public Response processDatasetPDF(@FormDataParam(INPUT) InputStream inputStream, public Response processTEI( @FormDataParam(INPUT) InputStream inputStream, @FormDataParam("segmentSentences") String segmentSentences) { - return DatastetProcessFile.processTEI(inputStream, segmentSentences); + boolean segmentSentencesBoolean = 
DatastetServiceUtils.validateBooleanRawParam(segmentSentences); + return DatastetProcessFile.processTEI(inputStream, segmentSentencesBoolean); } @Path(PATH_DATASEER_JATS) diff --git a/src/main/java/org/grobid/service/controller/DatastetPaths.java b/src/main/java/org/grobid/service/controller/DatastetPaths.java index 49ca6f0..c6ac6ec 100644 --- a/src/main/java/org/grobid/service/controller/DatastetPaths.java +++ b/src/main/java/org/grobid/service/controller/DatastetPaths.java @@ -27,6 +27,8 @@ public interface DatastetPaths { */ public static final String PATH_DATASET_PDF = "annotateDatasetPDF"; + + public static final String PATH_DATASET_TEI = "processDatasetTEI"; /** * path extension for classifying a textual sentence input. */ diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index 3cc83d8..f8c4790 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -1,51 +1,35 @@ package org.grobid.service.controller; +import com.fasterxml.jackson.core.io.JsonStringEncoder; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.inject.Inject; import com.google.inject.Singleton; - import org.apache.commons.lang3.StringUtils; -import org.grobid.core.document.Document; +import org.apache.commons.lang3.tuple.Pair; +import org.grobid.core.data.BibDataSet; import org.grobid.core.data.Dataset; +import org.grobid.core.document.Document; import org.grobid.core.engines.DataseerClassifier; -import org.grobid.core.engines.Engine; -import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.DatasetParser; -import org.grobid.core.factory.GrobidFactory; import org.grobid.core.layout.Page; -import org.grobid.core.utilities.IOUtilities; import org.grobid.core.utilities.ArticleUtilities; -import org.grobid.core.utilities.DatastetUtilities; import org.grobid.core.utilities.GrobidProperties; +import org.grobid.core.utilities.IOUtilities; import org.grobid.service.exceptions.DatastetServiceException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.Status; import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MediaType; - +import javax.ws.rs.core.Response; +import javax.ws.rs.core.Response.Status; +import javax.xml.bind.DatatypeConverter; import java.io.File; import java.io.InputStream; -import java.util.List; -import java.util.ArrayList; -import java.util.NoSuchElementException; - -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - import java.security.DigestInputStream; import java.security.MessageDigest; -import javax.xml.bind.DatatypeConverter; - -import org.grobid.core.data.BibDataSet; -import org.grobid.core.data.BiblioComponent; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.NoSuchElementException; /** * @@ -69,9 +53,8 @@ public DatastetProcessFile() { * @param inputStream the data of origin TEI document * @return a response object which contains an enriched TEI representation of the document */ - public static Response processTEI(final InputStream inputStream, String segmentSentences) { + public static Response 
processTEI(final InputStream inputStream, boolean segmentSentences) { LOGGER.debug(methodLogIn()); - boolean segmentSentencesBool = validateTrueFalseParam(segmentSentences); String retVal = null; Response response = null; File originFile = null; @@ -85,7 +68,7 @@ public static Response processTEI(final InputStream inputStream, String segmentS } // starts conversion process - retVal = classifier.processTEI(originFile.getAbsolutePath(), segmentSentencesBool, false); + retVal = classifier.processTEI(originFile.getAbsolutePath(), segmentSentences, false); if (!isResultOK(retVal)) { response = Response.status(Response.Status.NO_CONTENT).build(); @@ -337,7 +320,7 @@ public static Response extractXML(final InputStream inputStream, } else { long start = System.currentTimeMillis(); - Pair>, List> extractionResult = parser.processXML(originFile, false, addParagraphContext); + Pair>, List> extractionResult = parser.processXML(originFile, false, false, addParagraphContext); long end = System.currentTimeMillis(); List> extractedEntities = null; @@ -368,12 +351,12 @@ public static Response extractXML(final InputStream inputStream, json.append("], \"references\":["); - if (extractionResult != null) { - List bibDataSet = extractionResult.getRight(); - if (bibDataSet != null && bibDataSet.size()>0) { - DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); - } - } +// if (extractionResult != null) { +// List bibDataSet = extractionResult.getRight(); +// if (bibDataSet != null && bibDataSet.size()>0) { +// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); +// } +// } json.append("]"); @@ -414,10 +397,12 @@ public static Response extractXML(final InputStream inputStream, * Uploads the origin TEI XML, process it and return the extracted dataset mention objects in JSON. 
* * @param inputStream the data of origin TEI + * @param segmentSentences add sentence segmentation if the TEI was not already segmented * @param addParagraphContext if true, the full paragraph where an annotation takes place is added * @return a response object containing the JSON annotations */ - public static Response extractTEI(final InputStream inputStream, + public static Response extractTEI(final InputStream inputStream, + boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) { LOGGER.debug(methodLogIn()); @@ -440,7 +425,7 @@ public static Response extractTEI(final InputStream inputStream, response = Response.status(Status.INTERNAL_SERVER_ERROR).build(); } else { long start = System.currentTimeMillis(); - Pair>, List> extractionResult = parser.processTEI(originFile, disambiguate, addParagraphContext); + Pair>, List> extractionResult = parser.processTEI(originFile, segmentSentences, disambiguate, addParagraphContext); long end = System.currentTimeMillis(); List> extractedEntities = null; @@ -469,12 +454,12 @@ public static Response extractTEI(final InputStream inputStream, } json.append("], \"references\":["); - if (extractionResult != null) { - List bibDataSet = extractionResult.getRight(); - if (bibDataSet != null && bibDataSet.size()>0) { - DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); - } - } +// if (extractionResult != null) { +// List bibDataSet = extractionResult.getRight(); +// if (bibDataSet != null && bibDataSet.size()>0) { +// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); +// } +// } float runtime = ((float)(end-start)/1000); json.append(", \"runtime\": "+ runtime); @@ -532,4 +517,7 @@ public static boolean isResultOK(String result) { return StringUtils.isBlank(result) ? 
false : true; } + public static Response processDatasetTEI(InputStream inputStream, boolean segmentSentences, boolean addParagraphContextBoolean) { + return extractTEI(inputStream, segmentSentences, false, addParagraphContextBoolean); + } } diff --git a/src/test/java/org/grobid/core/engines/DatasetParserTest.java b/src/test/java/org/grobid/core/engines/DatasetParserTest.java new file mode 100644 index 0000000..dd6eec0 --- /dev/null +++ b/src/test/java/org/grobid/core/engines/DatasetParserTest.java @@ -0,0 +1,18 @@ +package org.grobid.core.engines; + +import junit.framework.TestCase; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +public class DatasetParserTest extends TestCase { + + @Test + public void testGetXPathWithoutNamespaces() { + String output = DatasetParser.getXPathWithoutNamespaces("//abstract/p/s"); + + assertThat(output, is("//*[local-name() = 'abstract']/*[local-name() = 'p']/*[local-name() = 's']")); + } + +} \ No newline at end of file From d66862508810ff856be25cfc7b24ca95aeb06872 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 26 Apr 2024 14:17:28 +0800 Subject: [PATCH 12/46] fix output JSON streaming --- resources/config/config.yml | 16 ++++++++-------- .../service/controller/DatastetProcessFile.java | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/resources/config/config.yml b/resources/config/config.yml index a0ab98f..3bb04a3 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -154,18 +154,18 @@ logging: com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" appenders: - type: console - threshold: WARN + threshold: INFO timeZone: UTC # uncomment to have the logs in json format #layout: # type: json - - type: file - currentLogFilename: logs/datastet-service.log - threshold: INFO - archive: true - archivedLogFilenamePattern: logs/datastet-service-%d.log - archivedFileCount: 7 - timeZone: UTC +# - type: file +# currentLogFilename: logs/datastet-service.log +# threshold: INFO +# archive: true +# archivedLogFilenamePattern: logs/datastet-service-%d.log +# archivedFileCount: 7 +# timeZone: UTC # uncomment to have the logs in json format #layout: # type: json diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index f8c4790..dc49d78 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -452,7 +452,7 @@ public static Response extractTEI(final InputStream inputStream, } } } - json.append("], \"references\":["); + json.append("], \"references\":[]"); // if (extractionResult != null) { // List bibDataSet = extractionResult.getRight(); From 288850f13158289c527627786db75703b9ff1143 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 May 2024 11:27:40 +0900 Subject: [PATCH 13/46] add the rest of the processing --- build.gradle | 11 + .../grobid/core/engines/DatasetParser.java | 324 ++++---- .../core/engines/DatasetParserTest.java | 38 + ...om scientific literature.with_urls.tei.xml | 760 ++++++++++++++++++ .../core/engines/erl_18_11_114012.tei.xml | 540 +++++++++++++ 5 files changed, 1517 insertions(+), 156 deletions(-) create mode 100644 src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml create mode 100644 
src/test/resources/org/grobid/core/engines/erl_18_11_114012.tei.xml diff --git a/build.gradle b/build.gradle index 415ac45..5243eac 100644 --- a/build.gradle +++ b/build.gradle @@ -173,6 +173,17 @@ ext.getArg = { propName, defaultVal -> return project.hasProperty(propName) ? project.getProperty(propName) : defaultVal; } +task integration(type: Test) { + include '**' + maxHeapSize = "1024m" + + if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { + jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", + "--add-opens", "java.base/java.io=ALL-UNNAMED" + } + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries +} + import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar apply plugin: 'java' diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index e00f0cf..bd529a9 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -9,6 +9,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.grobid.core.GrobidModel; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.DatastetAnalyzer; import org.grobid.core.data.*; @@ -39,6 +40,8 @@ import org.grobid.core.utilities.counters.impl.CntManagerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -87,6 +90,10 @@ private static synchronized void getNewInstance(DatastetConfiguration configurat instance = new DatasetParser(configuration); } + protected DatasetParser(GrobidModel model) { + super(model); + } + private DatasetParser(DatastetConfiguration configuration) { super(DatasetModels.DATASET, CntManagerFactory.getCntManager(), GrobidCRFEngine.valueOf(configuration.getModel("datasets").engine.toUpperCase()), @@ -782,6 +789,7 @@ public Pair>, Document> processPDF(File file, if (previousSection == null || previousSection.equals("das")) { if (curParagraphTokens == null) curParagraphTokens = new ArrayList<>(); + //TODO: LF: why this is taken but not the body? curParagraphTokens.addAll(localTokenization); } } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH) || clusterLabel.equals(TaggingLabels.ITEM)) { @@ -1187,6 +1195,7 @@ public Pair>, Document> processPDF(File file, // we attach and match bibliographical reference callout TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); // second pass, body + //TODO: LF: why only the body? if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { List bibRefComponents = new ArrayList(); for (TaggingTokenCluster cluster : bodyClusters) { @@ -1390,7 +1399,7 @@ public Pair>, List> processTEI(File file, boolean } /** - * Tranform an XML document (for example JATS) to a TEI document. + * Transform an XML document (for example JATS) to a TEI document. * Transformation of the XML/JATS/NLM/etc. 
document is realised thanks to Pub2TEI * (https://github.com/kermitt2/pub2tei) * @@ -1462,6 +1471,9 @@ public Pair>, List> processTEIDocument(org.w3c.do boolean addParagraphContext) { List selectedSequences = new ArrayList<>(); + //The references callout are loaded here, so that we can recover the position in the text + // we need target, text value, and position (character related) + List>> selectedSequencesReferences = new ArrayList<>(); List relevantSectionsNamedDatasets = new ArrayList<>(); List relevantSectionsImplicitDatasets = new ArrayList<>(); @@ -1475,7 +1487,7 @@ public Pair>, List> processTEIDocument(org.w3c.do doc, XPathConstants.NODE); if (titleNode == null) { - LOGGER.warn("Title was not founded, skipping."); + LOGGER.warn("Title was not found, skipping."); } else { String textTitle = titleNode.getTextContent(); selectedSequences.add(textTitle); @@ -1485,7 +1497,7 @@ public Pair>, List> processTEIDocument(org.w3c.do } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Title was not founded, skipping."); + LOGGER.warn("Title was not found, skipping."); } try { @@ -1496,8 +1508,8 @@ public Pair>, List> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < abstractNodeList.getLength(); i++) { org.w3c.dom.Node item = abstractNodeList.item(i); - String abstractSentence = item.getTextContent(); - selectedSequences.add(abstractSentence); + String text = item.getTextContent(); + selectedSequences.add(text); //LF Not clear why true, just copied from around ProcessPDF:578 relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(false); @@ -1505,7 +1517,7 @@ public Pair>, List> processTEIDocument(org.w3c.do } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Abstract was not founded, skipping."); + LOGGER.warn("Abstract was not found, skipping."); } try { @@ -1524,7 +1536,13 @@ public Pair>, List> processTEIDocument(org.w3c.do } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Keywords was not founded, skipping."); + LOGGER.warn("Keywords was not found, skipping."); + } + + // Fill up the references to match the current sentence/paragraphs + + for (String seq : selectedSequences) { + selectedSequencesReferences.add(new HashMap<>()); } // Extraction from Body @@ -1536,19 +1554,23 @@ public Pair>, List> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < bodyNodeList.getLength(); i++) { org.w3c.dom.Node item = bodyNodeList.item(i); - String abstractSentence = item.getTextContent(); - selectedSequences.add(abstractSentence); + String text = item.getTextContent(); + selectedSequences.add(text); + //LF Not clear why true, just copied from around ProcessPDF:635 relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); + + Map> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); + selectedSequencesReferences.add(referencesInText); } } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Abstract was not founded, skipping."); + LOGGER.warn("Body was not found, skipping."); } - // Various statements (acknowledgement, funding, data availaiblity) + // Various statements (acknowledgement, funding, data availability) // funding and acknowledgement at the moment have only paragraphs (Grobid issue # List sectionTypesOnlyParagraphs = Arrays.asList("acknowledgement", "funding"); @@ -1561,20 +1583,20 @@ public Pair>, List> processTEIDocument(org.w3c.do 
XPathConstants.NODESET); for (int i = 0; i < annexNodeList.getLength(); i++) { org.w3c.dom.Node item = annexNodeList.item(i); - String abstractSentence = item.getTextContent(); - selectedSequences.add(abstractSentence); + String text = item.getTextContent(); + selectedSequences.add(text); + selectedSequencesReferences.add(new HashMap<>()); relevantSectionsNamedDatasets.add(false); relevantSectionsImplicitDatasets.add(false); } } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Abstract was not founded, skipping."); + LOGGER.warn("Abstract was not found, skipping."); } } // Annex might contain misclassified relevant sections - try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='annex']]/*[local-name() = 'div']"; org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, @@ -1604,6 +1626,7 @@ public Pair>, List> processTEIDocument(org.w3c.do String paragraph = paragraphAnnex.getTextContent(); selectedSequences.add(paragraph); + selectedSequencesReferences.add(new HashMap<>()); if (StringUtils.equals(currentSection, "das")) { relevantSectionsNamedDatasets.add(true); @@ -1617,7 +1640,7 @@ public Pair>, List> processTEIDocument(org.w3c.do } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Annex was not founded, skipping."); + LOGGER.warn("Annex was not found, skipping."); } // availability statement have sentences @@ -1635,9 +1658,10 @@ public Pair>, List> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < annexNodeList.getLength(); i++) { org.w3c.dom.Node item = annexNodeList.item(i); - String abstractSentence = item.getTextContent(); - selectedSequences.add(abstractSentence); - availabilityTokens.addAll(analyzer.tokenizeWithLayoutToken(abstractSentence)); + String text = item.getTextContent(); + selectedSequences.add(text); + selectedSequencesReferences.add(new HashMap<>()); + availabilityTokens.addAll(analyzer.tokenizeWithLayoutToken(text)); relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); } @@ -1657,8 +1681,9 @@ public Pair>, List> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < bodyNodeList.getLength(); i++) { org.w3c.dom.Node item = bodyNodeList.item(i); - String abstractSentence = item.getTextContent(); - selectedSequences.add(abstractSentence); + String text = item.getTextContent(); + selectedSequences.add(text); + selectedSequencesReferences.add(new HashMap<>()); //LF Not clear why true, just copied from around ProcessPDF:635 relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(false); @@ -1669,9 +1694,61 @@ public Pair>, List> processTEIDocument(org.w3c.do LOGGER.warn("Footnotes were not found or an error was thrown, skipping."); } + // Read and parse references + Map> referenceMap = new HashMap<>(); + try { + String expression = "//*[local-name() = 'div'][@*[local-name()='type' and .='references']]/*[local-name() = 'listBibl']/*[local-name() = 'biblStruct']"; + org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); - //Dataset Recognition + for (int i = 0; i < bodyNodeList.getLength(); i++) { + org.w3c.dom.Node item = bodyNodeList.item(i); + if (item.hasAttributes()) { + for (int a = 0; a < item.getAttributes().getLength(); a++) { + org.w3c.dom.Node attribute = item.getAttributes().item(a); + if (attribute.getNodeName().equals("xml:id")) { + String 
referenceText = item.getTextContent(); + String cleanedRawReferenceText = referenceText.replaceAll("\\s", " ").strip().replaceAll("[ ]{2,}", ", "); + referenceMap.put(attribute.getNodeValue(), Pair.of(cleanedRawReferenceText, item)); + } + } + } + } + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Something wrong when extracting references. Skipping them."); + } + + + // We need to link the references and their callout + List bibRefComponents = new ArrayList<>(); + Map biblioRefMap = new HashMap<>(); + for(Map> ref :selectedSequencesReferences) { + for (String refText : ref.keySet()) { + Pair infos = ref.get(refText); + + String target = infos.getRight(); + OffsetPosition position = infos.getLeft(); + + Pair referenceInformation = referenceMap.get(target); + if (referenceInformation != null) { + BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); + biblioRefMap.put(refText, biblioItem); + BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", ""))); + biblioComponent.setRawForm(refText); + biblioComponent.setOffsetStart(position.start); + biblioComponent.setOffsetEnd(position.end); + // TODO: fetch the coords if they are in the TEI +// List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); +// biblioComponent.setBoundingBoxes(boundingBoxes); + bibRefComponents.add(biblioComponent); + } + } + } + + //Dataset Recognition List> entities = new ArrayList<>(); List> selectedSequencesLayoutTokens = new ArrayList<>(); @@ -1765,9 +1842,6 @@ public Pair>, List> processTEIDocument(org.w3c.do } entities = newEntities; - // selection of relevant data sections - //List relevantSections = DataseerParser.getInstance().processingText(segments, sectionTypes, nbDatasets, datasetTypes); - // filter implicit datasets based on selected relevant data section List> filteredEntities = new ArrayList<>(); for (int i = 0; i < entities.size(); i++) { @@ -1804,137 +1878,75 @@ public Pair>, List> processTEIDocument(org.w3c.do } entities = filteredEntities; - // we attach and match bibliographical reference callout -// TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); - // second pass, body - -// if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { -// List bibRefComponents = new ArrayList(); -// for (TaggingTokenCluster cluster : bodyClusters) { -// if (cluster == null) { -// continue; -// } -// -// TaggingLabel clusterLabel = cluster.getTaggingLabel(); -// -// List localTokenization = cluster.concatTokens(); -// if ((localTokenization == null) || (localTokenization.size() == 0)) -// continue; -// -// if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { -// List refTokens = TextUtilities.dehyphenize(localTokenization); -// String chunkRefString = LayoutTokensUtil.toText(refTokens); -// -// List refNodes = formatter.markReferencesTEILuceneBased(refTokens, -// doc.getReferenceMarkerMatcher(), -// true, // generate coordinates -// false); // do not mark unsolved callout as ref -// -// if (refNodes != null) { -// for (Node refNode : refNodes) { -// if (refNode instanceof Element) { -// // get the bib ref key -// String refKey = ((Element) refNode).getAttributeValue("target"); -// -// if (refKey == null) -// continue; -// -// int refKeyVal = -1; -// if (refKey.startsWith("#b")) { -// refKey = refKey.substring(2, refKey.length()); -// try { -// refKeyVal = Integer.parseInt(refKey); -// } catch (Exception e) { -// 
LOGGER.warn("Invalid ref identifier: " + refKey); -// } -// } -// if (refKeyVal == -1) -// continue; -// -// // get the bibref object -// BibDataSet resBib = resCitations.get(refKeyVal); -// if (resBib != null) { -// BiblioComponent biblioComponent = new BiblioComponent(resBib.getResBib(), refKeyVal); -// biblioComponent.setRawForm(refNode.getValue()); -// biblioComponent.setOffsetStart(refTokens.get(0).getOffset()); -// biblioComponent.setOffsetEnd(refTokens.get(refTokens.size() - 1).getOffset() + -// refTokens.get(refTokens.size() - 1).getText().length()); -// List boundingBoxes = BoundingBoxCalculator.calculate(refTokens); -// biblioComponent.setBoundingBoxes(boundingBoxes); -// bibRefComponents.add(biblioComponent); -// } -// } -// } -// } -// } -// } -// -// -// if (bibRefComponents.size() > 0) { -// // attach references to dataset entities -// entities = attachRefBib(entities, bibRefComponents); -// } -// -// // consolidate the attached ref bib (we don't consolidate all bibliographical references -// // to avoid useless costly computation) -// List citationsToConsolidate = new ArrayList(); -// List consolidated = new ArrayList(); -// for (List datasets : entities) { -// for (Dataset entity : datasets) { -// if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { -// List bibRefs = entity.getBibRefs(); -// for (BiblioComponent bibRef : bibRefs) { -// Integer refKeyVal = Integer.valueOf(bibRef.getRefKey()); -// if (!consolidated.contains(refKeyVal)) { -// citationsToConsolidate.add(resCitations.get(refKeyVal)); -// consolidated.add(refKeyVal); -// } -// } -// } -// } -// } -// -// try { -// Consolidation consolidator = Consolidation.getInstance(); -// Map resConsolidation = consolidator.consolidate(citationsToConsolidate); -// for (int j = 0; j < citationsToConsolidate.size(); j++) { -// BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); -// BiblioItem bibo = resConsolidation.get(j); -// if (bibo != null) { -// BiblioItem.correct(resCitation, bibo); -// } -// } -// } catch (Exception e) { -// throw new GrobidException( -// "An exception occured while running consolidation on bibliographical references.", e); -// } -// -// // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
-// for (List datasets1 : entities) { -// for (Dataset entity1 : datasets1) { -// if (entity1.getBibRefs() != null && entity1.getBibRefs().size() > 0) { -// for (List datasets2 : entities) { -// for (Dataset entity2 : datasets2) { -// if (entity2.getBibRefs() != null) { -// continue; -// } -// if ((entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && -// entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && -// (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || -// entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) -// ) { -// List newBibRefs = new ArrayList<>(); -// for (BiblioComponent bibComponent : entity1.getBibRefs()) { -// newBibRefs.add(new BiblioComponent(bibComponent)); -// } -// entity2.setBibRefs(newBibRefs); -// } -// } -// } -// } -// } -// } -// } + + // Enhance information in dataset entities + if (CollectionUtils.isNotEmpty(bibRefComponents)) { + // attach references to dataset entities + entities = attachRefBib(entities, bibRefComponents); + } + + // consolidate the attached ref bib (we don't consolidate all bibliographical references + // to avoid useless costly computation) + List citationsToConsolidate = new ArrayList<>(); + List consolidated = new ArrayList<>(); + for (List datasets : entities) { + for (Dataset entity : datasets) { + if (CollectionUtils.isNotEmpty(entity.getBibRefs())) { + List bibRefs = entity.getBibRefs(); + for (BiblioComponent bibRef : bibRefs) { + Integer refKeyVal = bibRef.getRefKey(); + if (!consolidated.contains(refKeyVal)) { + BiblioItem biblioItem = biblioRefMap.get(refKeyVal); + BibDataSet biblioDataSet = new BibDataSet(); + biblioDataSet.setResBib(biblioItem); + citationsToConsolidate.add(biblioDataSet); + consolidated.add(refKeyVal); + } + } + } + } + } + + try { + Consolidation consolidator = Consolidation.getInstance(); + Map resConsolidation = consolidator.consolidate(citationsToConsolidate); + for (int j = 0; j < citationsToConsolidate.size(); j++) { + BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); + BiblioItem bibo = resConsolidation.get(j); + if (bibo != null) { + BiblioItem.correct(resCitation, bibo); + } + } + } catch (Exception e) { + throw new GrobidException( + "An exception occured while running consolidation on bibliographical references.", e); + } + + // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
+ for (List datasets1 : entities) { + for (Dataset entity1 : datasets1) { + if (CollectionUtils.isNotEmpty(entity1.getBibRefs())) { + for (List datasets2 : entities) { + for (Dataset entity2 : datasets2) { + if (entity2.getBibRefs() != null) { + continue; + } + if ((entity2.getDatasetName() != null && entity2.getDatasetName().getRawForm() != null && + entity1.getDatasetName() != null && entity1.getDatasetName().getRawForm() != null) && + (entity2.getDatasetName().getNormalizedForm().equals(entity1.getDatasetName().getNormalizedForm()) || + entity2.getDatasetName().getRawForm().equals(entity1.getDatasetName().getRawForm())) + ) { + List newBibRefs = new ArrayList<>(); + for (BiblioComponent bibComponent : entity1.getBibRefs()) { + newBibRefs.add(new BiblioComponent(bibComponent)); + } + entity2.setBibRefs(newBibRefs); + } + } + } + } + } + } // mark datasets present in Data Availability section(s) if (CollectionUtils.isNotEmpty(availabilityTokens)) { diff --git a/src/test/java/org/grobid/core/engines/DatasetParserTest.java b/src/test/java/org/grobid/core/engines/DatasetParserTest.java index dd6eec0..d0d0db9 100644 --- a/src/test/java/org/grobid/core/engines/DatasetParserTest.java +++ b/src/test/java/org/grobid/core/engines/DatasetParserTest.java @@ -1,12 +1,41 @@ package org.grobid.core.engines; import junit.framework.TestCase; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.grobid.core.GrobidModels; +import org.grobid.core.data.BibDataSet; +import org.grobid.core.data.Dataset; +import org.grobid.core.main.GrobidHomeFinder; +import org.grobid.core.utilities.GrobidConfig; +import org.grobid.core.utilities.GrobidProperties; +import org.junit.Before; import org.junit.Test; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +import static com.google.common.base.Predicates.notNull; import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.assertThat; public class DatasetParserTest extends TestCase { + private DatasetParser target; + + @Before + public void setUp() throws Exception { + GrobidProperties.getInstance(new GrobidHomeFinder(Arrays.asList("../../grobid/grobid-home/"))); + GrobidConfig.ModelParameters modelParameters = new GrobidConfig.ModelParameters(); + modelParameters.name = "bao"; + GrobidProperties.addModel(modelParameters); + target = new DatasetParser(GrobidModels.DUMMY); + } + @Test public void testGetXPathWithoutNamespaces() { @@ -15,4 +44,13 @@ public void testGetXPathWithoutNamespaces() { assertThat(output, is("//*[local-name() = 'abstract']/*[local-name() = 'p']/*[local-name() = 's']")); } + @Test + public void testProcessTEIDocument() throws Exception { + String text = IOUtils.toString(Objects.requireNonNull(this.getClass().getResourceAsStream("erl_18_11_114012.tei.xml")), StandardCharsets.UTF_8); + + Pair>, List> listListPair = target.processTEIDocument(text, true, false, false); + + assertThat(listListPair, is(notNull())); + + } } \ No newline at end of file diff --git a/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml b/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml new file mode 100644 index 
0000000..2ab3daa
--- /dev/null
+++ b/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml
@@ -0,0 +1,760 @@
[TEI header of the new test fixture: journal "Science and Technology of Advanced Materials: Methods", funders MEXT and the Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials); article "Semi-automatic staging area for high-quality structured data extraction from scientific literature", DOI 10.1080/27660400.2023.2286219, received 8 September 2023, revised 9 November 2023, accepted 16 November 2023, published 14 Dec 2023; authors Luca Foppiano, Tomoya Mato, Kensei Terashima, Pedro Ortiz Suarez, Taku Tou, Chikako Sakai, Wei-Sheng Wang, Toshiyuki Amagasa, Yoshihiko Takano and Masashi Ishii, with ORCID identifiers and affiliations at NIMS, the University of Tsukuba and DFKI GmbH; keywords: materials informatics, superconductors, machine learning, database, TDM. The abstract and body text of the fixture follow.]

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2, to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such a training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], which have played the role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spin-orbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach.

In the subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow also supports automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data to improve the ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

Curation status

The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ).

• removed: the record has been removed by a curator (internal status).
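
Taken together with the manual actions listed in Section 2 (validate, invalidate, correct, remove), these statuses behave like a small state machine. Below is a minimal Java sketch of that idea; the enum constants mirror the values above, while the action names and the transition logic are illustrative assumptions rather than the actual SuperCon 2 implementation.

public class CurationStateSketch {

    // Status values listed above; OBSOLETE and REMOVED are internal statuses.
    enum CurationStatus { NEW, CURATED, VALIDATED, INVALID, OBSOLETE, REMOVED }

    // Manual actions described in Section 2; the names are assumptions made for this sketch.
    enum CurationAction { MARK_VALID, MARK_INVALID, CORRECT, REMOVE }

    /** Returns the status of the examined record after a manual action. */
    static CurationStatus applyAction(CurationStatus current, CurationAction action) {
        switch (action) {
            case MARK_VALID:
                return CurationStatus.VALIDATED;
            case MARK_INVALID:
                return CurationStatus.INVALID;
            case CORRECT:
                // The corrected values are stored in a new CURATED record,
                // while the examined record itself becomes OBSOLETE.
                return CurationStatus.OBSOLETE;
            case REMOVE:
                return CurationStatus.REMOVED;
            default:
                return current;
        }
    }

    public static void main(String[] args) {
        CurationStatus status = CurationStatus.NEW;
        status = applyAction(status, CurationAction.MARK_VALID);
        System.out.println(status); // VALIDATED
    }
}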

Error types

We first introduced error types in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows:

• Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables.

• Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.

• Curation amends: The curator is updating the data which does not present issues due to the automatic system.

Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches the following conditions:

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]')

• the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15] and text2chem [16]

• the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.
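
A rule-based screen of this kind is straightforward to express in code. The following Java sketch mirrors the three conditions above; the FormulaParser interface stands in for the Pymatgen/text2chem ensemble parser mentioned in the text, and all class, method, and parameter names are illustrative assumptions rather than the actual implementation.

public class AnomalyDetectionSketch {

    /** Hypothetical stand-in for the ensemble composition parser (Pymatgen + text2chem). */
    interface FormulaParser {
        boolean canParse(String formula);
    }

    static final double ROOM_TEMPERATURE_K = 273.0;
    static final double MAX_PRESSURE_GPA = 250.0;

    /** A null Tc or formula represents a value that could not be parsed from the text;
     *  a null pressure is treated as "not reported" in this simplified sketch. */
    static boolean isAnomaly(Double tcKelvin, String formula, Double appliedPressureGPa,
                             FormulaParser parser) {
        // Tc could not be parsed, is negative, or is above room temperature.
        if (tcKelvin == null || tcKelvin < 0 || tcKelvin > ROOM_TEMPERATURE_K) {
            return true;
        }
        // The chemical formula cannot be resolved by the composition parser.
        if (formula == null || !parser.canParse(formula)) {
            return true;
        }
        // The applied pressure, when reported, must fall inside the 0-250 GPa range.
        if (appliedPressureGPa != null
                && (appliedPressureGPa < 0 || appliedPressureGPa > MAX_PRESSURE_GPA)) {
            return true;
        }
        return false;
    }

    public static void main(String[] args) {
        // Naive parser used only for the demo: element symbols optionally followed by numbers.
        FormulaParser parser = f -> f.matches("([A-Z][a-z]?\\d*(\\.\\d+)?)+");
        System.out.println(isAnomaly(300.0, "MgB2", null, parser)); // true: Tc above 273 K
        System.out.println(isAnomaly(39.0, "MgB2", null, parser));  // false
    }
}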

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time and collect as much information as possible, we integrated an automatic procedure into the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

Training data collection

In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout token information. This information is sufficient to be exported as training examples, which can be examined, corrected, and fed back to the ML model.
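
In other words, every correction yields a self-contained training example. A possible shape for such an example is sketched below in Java; the field names and label strings are assumptions for illustration, not the actual SuperCon 2 schema.

import java.util.List;

public class TrainingExampleSketch {

    /** A recognised entity span inside the text passage (character offsets and label). */
    record Span(int start, int end, String label) { }

    /** Raw data gathered when a record is corrected; field names are illustrative. */
    record TrainingExample(String documentId, String textPassage, List<Span> spans, String status) { }

    public static void main(String[] args) {
        TrainingExample example = new TrainingExample(
                "doc-001",
                "MgB2 shows superconductivity at 39 K.",
                List.of(new Span(0, 4, "material"), new Span(32, 36, "tcValue")),
                "new");
        System.out.println(example);
    }
}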

Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2), in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. Users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label Studio [17] for the correction of the collected training examples. Label Studio is an open-source, Python-based, modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

[Figure 2 caption: Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used Label Studio [17]. The column 'status' indicates whether the example has been sent or not to the external tool.]

Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

Curation and processing logs

The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

+
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many detected anomalies were rejected by curators after validation. Then, we demonstrate that the automatically selected training data contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we show that using the interface improves the quality of the curated data by reducing missing experimental data.

+
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate', i.e. the proportion of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset of 667 records, it found 17 anomalies in T c, 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c, 6 anomalies (37%) in chemical formulas, and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives, although a study with a larger dataset might be necessary.
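A quick arithmetic check of the reported rates (rejected anomalies divided by detected anomalies), using only the counts given above:

```python
# Worked check of the reported rejection rates (rejected / detected anomalies).
detected = {"Tc": 17, "applied pressure": 1, "chemical formula": 16}
rejected = {"Tc": 4, "applied pressure": 0, "chemical formula": 6}

for quantity, n in detected.items():
    print(f"{quantity}: {rejected[quantity]}/{n} = {rejected[quantity] / n:.1%}")
# Tc: 4/17 = 23.5%, applied pressure: 0/1 = 0.0%, chemical formula: 6/16 = 37.5%
```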

+
Training data generation

We selected around 400 records in the SuperCon 2 Database that were marked as invalid by the anomaly detection process and corrected them following the curation guidelines (Section 3.2). We then examined the corresponding training data collected by the interface (Section 2.3) and obtained a set of 352 training examples for our ML models. We call the resulting dataset curation, to distinguish it from the original SuperMat dataset, which is referred to as base.

We prepared our experiment using SciBERT [19], which we fine-tuned for our downstream task as in [1]. We trained five models and evaluated them using a fixed holdout dataset from SuperMat, averaging the results to smooth out fluctuations. We used the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s).
(2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols (enumerated with the figure captions below). We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1], and the evaluation scores are shown in Table 1.
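The sketch below restates the three protocols as dataset/checkpoint combinations. fine_tune() is a placeholder rather than the actual DeLFT training call, and the dataset sizes are toy values except for the 352 curation examples.

```python
# Illustrative sketch of the three training protocols; fine_tune() is a stand-in,
# not the actual DeLFT/SciBERT training call, and the datasets are toy lists.

def fine_tune(train_data, init_weights=None):
    """Stand-in trainer: records how much data it saw and which checkpoint it started from."""
    return {"examples_seen": len(train_data), "initialised_from": init_weights}

base = ["base example"] * 1000         # stands in for the SuperMat 'base' dataset
curation = ["curation example"] * 352  # the 352 examples collected via the interface

ckpt_base_s = fine_tune(base)                                            # (1) base(s)
ckpt_merged_s = fine_tune(base + curation)                               # (2) (base+curation)(s)
ckpt_incremental = fine_tune(base + curation, init_weights=ckpt_base_s)  # (3) base(s)+(base+curation)(i)
```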

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset), comprising 1846 additional entities (11% of the entities in the SuperMat dataset) (Table 2), we obtain an improvement in F1-score from 76.67% (note 2) to between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could be better tuned to obtain better results with the second and third training protocols.

Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs of training and evaluation.

+
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators: a senior researcher (SR), a PhD student (PD), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF document method. Overall, each pair of curators had five papers in common, which they had to process using opposite methods. For instance, if curator A received paper 1 to be corrected with the interface, curator B, who received the same paper 1, corrected it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data are available in Tables A1 and A2.

We evaluated the curation from a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures: precision, recall, and F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
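For reference, a minimal worked example of these measures, using the counts of one document from Table A2 (13 true positives, 1 false positive, 0 false negatives):

```python
# Worked definitions of the correctness measures used in the experiment.
def precision(tp, fp):
    return tp / (tp + fp) if tp + fp else 0.0

def recall(tp, fn):
    return tp / (tp + fn) if tp + fn else 0.0

def f1(p, r):
    # harmonic mean of precision and recall
    return 2 * p * r / (p + r) if p + r else 0.0

p, r = precision(13, 1), recall(13, 0)
print(f"P={p:.2%} R={r:.2%} F1={f1(p, r):.2%}")  # P=92.86% R=100.00% F1=96.30%
```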

+
Discussion

Overall, both methods required essentially the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, so the accumulated 185 minutes includes such activities.

We examined the quality of the extracted data and observed an improvement of +5.55% in precision and a substantial +46.69% in recall when using the interface compared with the PDF document method (Table 3). The F1-score improved by +39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. The senior researcher consistently achieved an average F1-score approximately 13% higher than the other curators (see Table 4). Furthermore, we observed only a modest difference between the master's student and the PhD student. These findings also indicate that for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, reserving a few senior researchers for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators produced overall more correct results when using the interface, as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

+
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface and the curation workflow.

+
Conclusions

We built a semi-automatic staging area, called SuperCon 2, to efficiently validate new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation, with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information into an Excel file, SuperCon 2 significantly improves the curation quality, by approximately +6% in precision and +47% in recall. In future, this work can be expanded to support other materials science domains, such as magnetic materials, spintronics, and thermoelectric research, and by extending the evaluation to a larger dataset [22].

+
Notes

1. The 'internal status' indicates that the records should be hidden in the interface. 2. In our previous work [1] we reported a 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0, such as batch size and learning rate. However, the most probable cause is the impact of using the Huggingface tokenizers library, which suffers from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. New records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc.; Section 3.1) and result in changes of the properties in the node. Each combination of property values identifies a state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
+
Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool; we used Label Studio [17]. The column 'status' indicates whether the example has been sent to the external tool.
+
Figure 3. Screenshot of the SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are search-by-attribute, sorting, and other filtering operations. On the right, there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white backgrounds.
+
Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) the sample input data, a screenshot of the record from the SuperCon 2 interface; (b) the context, the related part of the annotated document referring to the record under examination; (c) the motivation, describing the issue; and (d) the action to be taken, with the expected output.
+
Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
+
(1) base(s): using the base dataset and training from scratch (s).
(2) (base+curation)(s): using both the base and curation datasets and training from scratch (s).
(3) base(s)+(base+curation)(i): using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
+
Figure 6. Top: processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: correction log, indicating each record, the number of updates, and the date/time of the last update. By clicking on the 'record id', it is possible to visualise the latest record values.
+
+
T c classification: the temperature is not correctly classified.
+
Table 2. Data support: the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

Label         base     base+curation   Δ
<class>       1646     1732            86
<material>    6943     7580            637
<me_method>   1883     1934            51
<pressure>    274      361             87
<tc>          3741     4269            528
<tcValue>     1099     1556            457
Total         15586    17432           1846

Table 4. Evaluation scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

+
Table 3. Evaluation scores (P: precision, R: recall, F1: F1-score) for curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).

Method         P (%)   R (%)   F1 (%)   # docs
PDF document   87.83   45.61   52.67    15
Interface      93.38   92.51   92.02    15
+
Table 5. Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher) and method (PDF document, interface).

Experience   Method         P (%)    R (%)    F1 (%)   # docs   # pages
MS           PDF document   94.58    36.55    48.67    6        46
MS           Interface      83.19    95.83    88.25    4        50
PD           PDF document   70.00    48.51    50.78    5        49
PD           Interface      96.67    82.86    88.11    5        51
SR           PDF document   100.00   55.56    61.03    4        51
SR           Interface      97.42    98.33    97.78    6        45
+
Table A2. Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.

Document ID   # pages   Method   # TP   # FP   # FN   P        R        F1
Senior researcher (SR)
0454e07f64    4         I        6      0      0      100.00   100.00   100.00
00c32076f4    13        P        8      0      0      100.00   100.00   100.00
0c7d3163ea    9         I        13     1      0      92.86    100.00   96.30
0da5febabf    11        P        8      0      1      100.00   88.89    94.12
0012333581    13        I        11     0      0      100.00   100.00   100.00
0aa1b3161f    5         I        9      0      1      100.00   90.00    94.74
0021fd339f    14        P        4      0      8      100.00   33.33    50.00
039105663f    9         I        11     1      0      91.67    100.00   95.65
02c4f00127    13        P        0      0      3      100.00   0.00     0.00
021c413172    5         I        15     0      0      100.00   100.00   100.00
PhD student (PD)
02bf1b3db9    7         I        5      0      2      100.00   71.43    83.33
00b50fc0a8    11        P        2      0      7      100.00   22.22    36.36
02cbc58819    4         I        4      0      3      100.00   57.14    72.73
044939701d    12        P        4      0      2      100.00   66.67    80.00
08e1cb8f4f    16        I        5      1      1      83.33    85.71    84.51
0454e07f64    4         P        0      1      5      0.00     16.67    0.00
00c32076f4    13        I        8      0      0      100.00   100.00   100.00
0c7d3163ea    9         P        9      0      5      100.00   64.29    78.26
0da5febabf    11        I        9      0      0      100.00   100.00   100.00
0012333581    13        P        4      4      3      50.00    72.73    59.26
Master student (MS)
0aa1b3161f    5         P        1      0      9      100.00   10.00    18.18
0021fd339f    14        I        12     3      3      80.00    100.00   88.89
039105663f    9         P        4      1      7      80.00    41.67    54.79
02c4f00127    13        I        3      1      1      75.00    100.00   85.71
021c413172    5         P        7      1      7      87.50    53.33    66.27
02bf1b3db9    7         P        2      0      5      100.00   28.57    44.44
00b50fc0a8    11        I        7      2      0      77.78    100.00   87.50
02cbc58819    4         P        5      0      2      100.00   71.43    83.33
044939701d    12        I        5      0      1      100.00   83.33    90.91
08e1cb8f4f    16        P        1      0      6      100.00   14.29    25.00
+

+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
+
+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+
+
+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent on each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
References

[1] Foppiano L, Castro PB, Suarez PO. Automatic extraction of materials and properties from superconductors scientific literature. Sci Technol Adv Mater. 2023;3(1):2153633. doi:10.1080/27660400.2022.2153633
[2] Oliveira ON, Oliveira MJ. Materials discovery with machine learning and knowledge discovery. Front Chem. 2022;10. doi:10.3389/fchem.2022.930369
[3] Jain A, Ong SP, Hautier G, et al. Commentary: the materials project: a materials genome approach to accelerating materials innovation. APL Mater. 2013;1(1):011002. doi:10.1063/1.4812323
[4] Curtarolo S, Setyawan W, Hart GL, et al. AFLOW: an automatic framework for high-throughput materials discovery. Comput Mater Sci. 2012;58.
[5] Draxl C, Scheffler M. The NOMAD laboratory: from data sharing to artificial intelligence. J Phys Mater. 2019;2(3):036001. doi:10.1088/2515-7639/ab13bb
[6] Pratheepan T. Global publication productivity in materials science research: a scientometric analysis. Indian J Inf Sources Serv. 2019;9(1).
[7] Blokhin E, Villars P. The PAULING FILE project and materials platform for data science: from big data toward materials genome. Cham: Springer International Publishing; 2018. doi:10.1007/978-3-319-42913-7_62-1
[8] Ishii M, Sakamoto K. Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases. Sci Technol Adv Mater. 2023;3(1):2223051. doi:10.1080/27660400.2023.2223051
[9] Roter B, Dordevic S. Predicting new superconductors and their critical temperatures using machine learning. Phys C. 2020;575:1353689. doi:10.1016/j.physc.2020.1353689
[10] Stanev V, Oses C, Kusne A, et al. Machine learning modeling of superconducting critical temperature. Npj Comput Mater. 2018;4(1). doi:10.1038/s41524-018-0085-8
[11] Tran H, Vu TN. Machine-learning approach for discovery of conventional superconductors. arXiv preprint arXiv:2211.03265; 2022.
[12] Konno T, Kurokawa H, Nabeshima F, et al. Deep learning model for finding new superconductors. Phys Rev B. 2021;103(1):014509. doi:10.1103/PhysRevB.103.014509
[13] Klie JC, Bugert M, Boullosa B, et al. The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation. In: Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations; Santa Fe, New Mexico; 2018.
[14] Nakayama H, Kubo T, Kamura J, et al. doccano: text annotation tool for human. Software; 2018.
[15] Ong SP, Richards WD, Jain A, et al. Python Materials Genomics (pymatgen): a robust, open-source Python library for materials analysis. Comput Mater Sci. 2013;68. doi:10.1016/j.commatsci.2012.10.028
[16] Kononova O, Huo H, He T, et al. Text-mined dataset of inorganic materials synthesis recipes. Sci Data. 2019;6:203. doi:10.1038/s41597-019-0224-1
[17] Tkachenko M, Malyuk M, Holmanyuk A, et al. Label Studio: data labeling software; 2020-2022. Open-source software.
[18] Foppiano L, Dieb S, Suzuki A, et al. SuperMat: construction of a linked annotated dataset from superconductors-related publications. Sci Technol Adv Mater: Methods. 2021;1(1). doi:10.1080/27660400.2021.1918396
[19] Beltagy I, Lo K, Cohan A. SciBERT: a pretrained language model for scientific text. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing; Hong Kong, China: Association for Computational Linguistics; 2019.
+
+ + + + + <ptr target="https://github.com/kermitt2/delft"/> + </analytic> + <monogr> + <title level="j">DeLFT contributors. Delft + + 2018-2023 + + + + + + + Overcoming catastrophic forgetting in neural networks + + JKirkpatrick + + + RPascanu + + + NCRabinowitz + + abs/1612.00796 + + + + CoRr + + 2016 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Contributors</surname></persName> + </author> + <author> + <persName><surname>Grobid</surname></persName> + </author> + <ptr target="https://github.com/kermitt2/grobid"/> + <imprint> + <date type="published" when="2008">2008 -2023</date> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> \ No newline at end of file diff --git a/src/test/resources/org/grobid/core/engines/erl_18_11_114012.tei.xml b/src/test/resources/org/grobid/core/engines/erl_18_11_114012.tei.xml new file mode 100644 index 0000000..f116e5c --- /dev/null +++ b/src/test/resources/org/grobid/core/engines/erl_18_11_114012.tei.xml @@ -0,0 +1,540 @@ +<?xml version="1.0" encoding="UTF-8"?> +<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink"> + <teiHeader xml:lang="en"> + <fileDesc> + <titleStmt> + <title level="a" type="main">Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos + + unknown + + + Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan + + + NASA + + + NASA Carbon Monitoring System + + + + + + + + 13 October 2023 + + + + + + ShijuanChen + shijuan.chen@yale.edu + + Yale School of the Environment + Yale University +
+ New Haven + CT + United States of America +
+
+ + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+
+ + CurtisEWoodcock + + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+
+ + ThathevaSaphangthong + + Department of Agriculture Land Management + Ministry of Agriculture and Forestry +
+ Vientiane + Laos +
+
+
+ + PontusOlofsson + + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+ + NASA Marshall Space Flight Center +
+ Huntsville + AL + United States of America +
+
+
+ Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos +
+ + + 13 October 2023 + + + 17112CCE7BFA5F63FB9BFE897A9E1A85 + 10.1088/1748-9326/acffdd +
+
+ + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + shifting cultivation + shifting agriculture + slash and burn + swidden agriculture + forest degradation + carbon emissions + deforestation + + + +

Although shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown. This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades. This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years. The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos. Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive. Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020. The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020. In addition, there were increased emissions from intensified use of fallow lands. This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management. This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation.

+
+
+ + + +
Introduction

Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation. It begins with the practice of 'slash-and-burn', where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation. After short-term cultivation, the plot is abandoned, which allows the vegetation to recover. Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017, Curtis et al 2018, Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015). Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small. Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.

Shifting cultivation has both short-term and long-term effects on carbon emissions (Ziegler et al 2012). In the short term, the slash-and-burn activities cause immediate release of carbon. In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems. Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics. In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period. Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting.

In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001, Douangsavanh et al 2006, Epprecht et al 2018, Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018, Chen et al 2023). It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023). Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020. Shifting cultivation activities are expected to increase due to the increasing demand for rice. Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005). Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before Chen et al (2023), carbon emissions from shifting cultivation have not been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018).

Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently (Chen et al 2023), and a comprehensive national-scale analysis of the spatial and temporal patterns of shifting cultivation has not been conducted to date. A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year (Messerli et al 2009, Silva et al 2011, Hett et al 2012, Hurni et al 2013a). It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach. Another approach is to use multi-temporal land cover data to map shifting cultivation (Leisz and Rasmussen 2012, Molinario et al 2015, Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018, Adhikary et al 2019, Kurien et al 2019). In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns (Heinimann et al 2013). Recently, Chen et al (2023) used satellite data to create shifting cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to support a national-scale spatial-temporal analysis. The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale (Tang et al 2020).

This study used the map products and reference data in Chen et al (2023), combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon dynamics of shifting cultivation in Laos. The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation.

+
Method
+
Mapping shifting cultivation

Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine (Chen et al 2023). CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos. CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) (Souza et al 2005) to monitor forest disturbances (figure 1). Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification. A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation. For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2). During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos (Chen et al 2023). Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% (Chen et al 2023). Chen et al (2023) describes more details of the monitoring method. Both the map products and the reference data from Chen et al (2023) were used in this study.
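A minimal sketch of the underlying idea, not the CCDC-SMA implementation: fit a harmonic (seasonal) model to an index time series and flag large negative departures as candidate slash-and-burn events. A real monitoring approach fits the model on a stable history period and tests new observations against it; the toy below fits the full series for brevity, and all thresholds and values are illustrative.

```python
import numpy as np

def fit_harmonic(t_years, y):
    """Least-squares fit of y ~ a0 + a1*cos(2*pi*t) + a2*sin(2*pi*t) (one seasonal cycle)."""
    X = np.column_stack([np.ones_like(t_years),
                         np.cos(2 * np.pi * t_years),
                         np.sin(2 * np.pi * t_years)])
    coef, *_ = np.linalg.lstsq(X, y, rcond=None)
    return coef, X @ coef

def flag_breaks(t_years, ndfi, threshold=-0.2):
    """Flag observations far below the fitted seasonal model as candidate disturbances."""
    _, fitted = fit_harmonic(t_years, ndfi)
    residuals = ndfi - fitted
    return t_years[residuals < threshold]

# Toy example: a stable forest NDFI signal with a sharp drop after mid-2018.
t = np.arange(2015.0, 2021.0, 0.1)
ndfi = 0.9 + 0.05 * np.sin(2 * np.pi * t)
ndfi[t > 2018.4] -= 0.6          # simulated slash-and-burn event
print(flag_breaks(t, ndfi))      # prints the dates flagged as disturbed
```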

+
Spatial-temporal patterns of shifting cultivation

The annual maps of shifting cultivation and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation. We estimated the area of shifting cultivation at different fallow and disturbance stages.

To explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using the reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020. In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-and-burn recorded by the interpreters (e.g. figures 1(a), (c) and 2).

Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure 3 as an example). These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points. For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event.

+
Carbon emission/removal

The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020). GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data (Beck et al 2020, Dubayah et al 2022) were used to explore the effect of shifting cultivation on biomass. GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted. To overlay the GEDI footprints and Landsat, for each GEDI footprint we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint. Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20° and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events. The reason why we excluded lidar points with slopes larger than 20° is that GEDI-based biomass estimates tend to be overestimated in steep terrain. AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others. Intact Forest here is defined as forests without significant anthropogenic disturbances. The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed. The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-and-burn activity. From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands.
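A sketch of this footprint screening, assuming the GEDI L4A footprints have already been joined to the 30 m map. The 'quality_flag' and 'degrade_flag' names come from the text above; the remaining column names and values are illustrative.

```python
import pandas as pd

# Hypothetical table of GEDI L4A footprints joined to the Landsat-based map.
footprints = pd.DataFrame({
    "agbd":         [151.9, 87.9, 39.5, 60.2],
    "quality_flag": [1, 1, 1, 0],      # 1 = good-quality waveform
    "degrade_flag": [0, 0, 0, 0],      # 0 = not degraded
    "slope_deg":    [12.0, 25.0, 8.0, 5.0],
    "edge_pixel":   [False, False, False, False],   # within 2 pixels of a clearing edge
    "map_class":    ["Intact Forest", "Inactive SC", "Active SC", "Intact Forest"],
})

usable = footprints[
    (footprints["quality_flag"] == 1)
    & (footprints["degrade_flag"] == 0)
    & (footprints["slope_deg"] < 20)
    & (~footprints["edge_pixel"])
]
print(usable.groupby("map_class")["agbd"].median())
```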

Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020. Table 1 shows the activity classes, definitions, and emission factors. The area of New Shifting Cultivation was estimated with a sampling-based method, and the areas of the other activity classes were estimated from the maps (see the note following table 1). The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve. Figure 8 shows an example of the spatially explicit emission factors for different activities. Specifically, the carbon emissions and removals of Fallow land -> Fallow land, Fallow land -> Cleared land, and Cleared land -> Fallow land were calculated as follows. The latest year of disturbance of fallow land was determined using the annual shifting cultivation maps. Then, the AGBD of fallow lands was calculated using equation (1). The differences in AGBD were obtained by subtracting the AGBD in the start year from the AGBD in the end year of each period. The differences in AGBD were multiplied by the area of each activity and by the conversion factor (0.5) to obtain the carbon emissions and removals of each activity. The average emission/removal factors were calculated as the emissions and removals divided by the total area of activities in each category.
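A worked example of this bookkeeping for one activity class, using the growth curve of equation (1) reported in the Results and the table 1 sign convention (positive = emission, negative = removal):

```python
import math

CF = 0.5  # conversion factor from biomass to carbon (table 1)

def agbd_fallow(years_regrowth):
    """Country-level growth curve (equation (1)): AGBD in Mg/ha for fallow land."""
    return 29.129 * math.log(years_regrowth) + 9.907

def flux_fallow_to_fallow(years_start, years_end, area_ha):
    """Carbon flux (Mg C) for fallow land that stays fallow over a period.
    Positive = emission, negative = removal (the land keeps sequestering carbon)."""
    delta_agbd = agbd_fallow(years_start) - agbd_fallow(years_end)
    return delta_agbd * CF * area_ha

# One hectare left fallow from 5 to 10 years of regrowth removes about 10 Mg C:
print(flux_fallow_to_fallow(5, 10, area_ha=1.0))   # ~ -10.1

# New Shifting Cultivation uses the intact-forest biomass instead of the curve:
INTACT_FOREST_AGBD = 151.9          # median GEDI AGBD of Intact Forest (Mg/ha)
print(INTACT_FOREST_AGBD * CF)      # 75.95 Mg C/ha, the table 1 emission factor
```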

+
Results
+
Spatial-temporal patterns of shifting cultivation

A large proportion of the land used for shifting cultivation in Laos remains in use. During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%). In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops. New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure 4). The area estimates were aggregated into 5 year periods instead of being calculated annually, to reduce the uncertainties of the area estimates. From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001. In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year. Our results indicate that the extent of shifting cultivation has been expanding.

During 2001-2015, there was a decrease in the area of New Shifting Cultivation. However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020. The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure 5). Before 2007, the newly cultivated areas were larger than the previously cultivated, and the trend reversed after 2007. There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020. We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation.

Based on the sample interpretation results, most cultivation lengths are either one year or two years. Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2). The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). The fallow length has been continuously declining (figure 6). The reduction in the length of fallow periods indicates that shifting cultivation has intensified.

+
Growth curve of fallow lands

The AGBD was lower in shifting cultivation regions than in the intact forests. The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 Mg ha⁻¹, 87.9 Mg ha⁻¹, 39.5 Mg ha⁻¹, and 22.8 Mg ha⁻¹, respectively. The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest. In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered', whereas our results show that the AGBD is not recovered even if these regions have been left fallow for at least seven years. To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and the median AGBD of GEDI footprints (figure 7). The logarithmic model of years of regrowth (x) and AGBD (y) is (R² = 0.93):

y = 29.129 ln(x) + 9.907    (1)

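Evaluating equation (1) at a few fallow lengths illustrates why even long fallows do not recover intact-forest biomass (median 151.9 Mg ha⁻¹):

```python
import math

def agbd_fallow(years):
    """Equation (1): estimated AGBD (Mg/ha) after a given number of years of regrowth."""
    return 29.129 * math.log(years) + 9.907

INTACT_MEDIAN = 151.9   # median AGBD of Intact Forest from GEDI (Mg/ha)

for years in (1, 5, 7, 15, 30):
    agbd = agbd_fallow(years)
    print(f"{years:2d} yr fallow: {agbd:6.1f} Mg/ha "
          f"({agbd / INTACT_MEDIAN:.0%} of intact forest)")
```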
AGBD was strongly correlated with years of regrowth. Equation (1) and the maps of years of regrowth were used to calculate the biomass of fallow lands and the spatially explicit emission/removal factors (figure 8).

+
Carbon emissions from shifting cultivation

The net carbon emissions from shifting cultivation declined during 2001-2015 but rose again in 2016-2020, reflecting both the increase in newly cultivated areas in this period (figure 5) and the decrease in the carbon sink of fallow lands in this period. For every period, New Shifting Cultivation is the largest carbon source, contributing more than 80% of the total emissions. From 2001 to 2020, New Shifting Cultivation contributed 89% of the total emissions. Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015. However, carbon sequestration by fallow lands also decreased in recent years because of the intensified use of fallow land. To summarize, the increase in emissions from shifting cultivation encroachment into intact forests (New Shifting Cultivation) and the intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation.

+
Discussion

In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed. The results showed that shifting cultivation has been expanding and intensifying. The area of shifting cultivation has increased significantly over the last 5 years. The fallow length has been declining continuously, which indicates the intensification of shifting cultivation. Our finding of a reduction in fallow length is consistent with previous local studies (Rasul and Thapa 2003, Saphangthong and Kono 2009, van Vliet et al 2012). We found that AGBD was strongly correlated with years of regrowth since the latest slash-and-burn activities, which can be used to estimate the biomass of fallow lands. That earlier forest-change products differ from our results is understandable, since their major focus is forest loss instead of shifting cultivation. This comparison is not a criticism of the aforementioned studies. Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns.

We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table S1). The Laos official forest change maps (https://nfms.maf.gov.la/) are created from the land cover classification maps from the start year and end year of each period (see the periods in table S1). Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect some consistency between the areas of New Shifting Cultivation and the areas of forest degradation and deforestation. There are consistencies in the periods 2006-2010 and 2011-2015, with the differences between our estimates and the official statistics both being less than 1% of Laos. Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010. This was partly due to the different monitoring approaches. Without using dense time series, shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end. In the periods 2001-2005 and 2016-2020, our estimates are about 2%-3% higher than the official estimates. For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics. Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos.

Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring System (https://nfms.maf.gov.la/). The shifting cultivation map was compared with 39 field points identified as 'Regenerating Vegetation' or 'Upland crop' in 2010, 2011, 2012, or 2019, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020). Of these, 31 out of 39 (80%) points are correctly mapped as shifting cultivation.

As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management. The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020. Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos. Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates. The fallow land sequestered a significant amount of carbon in the past, but this carbon sink declined in recent years. The recent increase in new shifting cultivation events also led to an increase in net carbon emissions. This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and of restoring old fallow lands.

Our study has several limitations, and future research can make improvements by using more sophisticated models and integrating other data. The first limitation is the usage of GEDI data. Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good-coverage data when the study was conducted. Future studies can use GEDI for multiple years as more data are collected. In addition, we excluded GEDI points where the slope is larger than 20° to avoid overestimation of biomass in steep terrain. This could introduce regional bias in the growth curve and emission factors. Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20° (Chen 2022). Future research should improve GEDI biomass estimates in steep terrain. Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited. Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. Salinas-Melgoza et al 2017, Borah et al 2018, Gogoi et al 2020). Third, the carbon estimation only considered aboveground biomass change and no other carbon pools, due to a lack of field survey data on those carbon pools. Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation. Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys.

+
Conclusion

Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos. Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years. The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening. Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified. The net carbon emissions from shifting cultivation declined in the past but increased recently. This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes.

+
Acknowledgements

This work was supported by the USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070). The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper.

Figure 1. An example of active shifting cultivation in previously cultivated land (location: 20°7′13″N, 101°6′59″E). The shifting cultivation events in 2005 and 2018 were categorized as Previous Shifting Cultivation because shifting cultivation first occurred in 1991. This place is also Active Shifting Cultivation because the latest shifting cultivation event occurred in 2018. (a) Landsat time series. (b) CCDC-SMA model fits. Different colors show different segments, and the model breaks in 1991, 2005, and 2018 show slash-and-burn events. The colored lines show the seasonality of the forest, and the drops between lines show slash-and-burn events. (c) Landsat images and high-resolution images on Google Earth. In the Landsat images (red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location.
+
Figure 2. An example of reference data (location: 20°15′8″N, 100°39′51″E). This shifting cultivation is New Shifting Cultivation. The time series shows that no shifting cultivation occurred before 2019. The new shifting cultivation event occurred in 2019, and it can be verified by examining high-resolution images and Landsat images. (In the time series figure, the blue points are Landsat observations. In the Landsat images (red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location.)
+
Figure 4. Area estimates and uncertainties of New Shifting Cultivation and total (new and previous) shifting cultivation by 5 year intervals. The y-axis is the area proportion of the total area of Laos (230 405 km²). Any pixel that was newly cultivated at any time within a specified 5 year period would be counted and added to the total height of the corresponding pink bars.
+
Figure 5. Annual proportion of slash-and-burn areas in previously and newly cultivated regions.
+
Figure 6. Average fallow length by year, calculated from sample interpretation.
+
Figure 7. Growth curve of aboveground biomass density.
+
+
Figure 8. Spatially explicit emission (+)/removal (-) factors for different activities in a region (the background image is the high-resolution image): (a) fallow land -> fallow land; (b) fallow land -> cleared land; (c) cleared land -> fallow land; (d) total of (a)-(c).
+
Figure 9. Carbon dynamics by period.
+
+
Table 1. Activity classes, definitions, and carbon emission/removal factors for each 5 year period (CF: conversion factor to convert biomass to carbon equivalents, CF = 0.5).

Activity class: Intact forest -> shifting cultivation (New Shifting Cultivation)
  Definition: No shifting cultivation before. Previous intact forests began to be used for new shifting cultivation.
  Emission/removal factor: Biomass of forest before new shifting cultivation × CF (75.95 Mg C ha⁻¹)

Activity class: Fallow land -> fallow land
  Definition: Shifting cultivation occurred before. The start and end land cover were both fallow lands.
  Emission/removal factor: (Fallow land biomass in the start - fallow land biomass in the end) × CF

Activity class: Fallow land -> cleared land
  Definition: In previously cultivated land, fallow land became cleared land.
  Emission/removal factor: (Fallow land biomass - cleared land biomass) × CF

Activity class: Cleared land -> fallow land
  Definition: In previously cultivated land, cleared land became fallow land.
  Emission/removal factor: (Cleared land biomass - fallow land biomass) × CF

Activity class: Cleared land -> cleared land
  Definition: In previously cultivated land, cleared land remained cleared land.
  Emission/removal factor: Zero

Note: the area of New Shifting Cultivation was estimated with a sampling-based method, and the areas of the other activity classes in table 1 were estimated from the maps. This is because the sampling-based area estimates of New Shifting Cultivation adjust for errors in mapping and are more accurate than pixel counting from the maps (Olofsson et al 2013, 2014). The area estimates of New Shifting Cultivation were calculated by 5-year periods with low uncertainty. For the other activity classes, it is difficult to obtain area estimates from the reference data while including the dynamics of the biomass of fallow land, and thus we used a spatially explicit method. In table 1, the biomass of the forest before disturbance was the biomass of Intact Forest estimated from GEDI. The biomass of fallow land was estimated from the growth curve developed from GEDI based on years since disturbance. Years since disturbance for each pixel were obtained from the annual maps of shifting cultivation. The cleared land biomass was estimated as the biomass of non-forest by the Department of Forestry (2020) based on field surveys. The emission factor of New Shifting Cultivation is 75.95 Mg C ha⁻¹. The emission factor of Cleared land -> Cleared land is zero.
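The 'sampling-based method' for New Shifting Cultivation can be illustrated with a simple-random-sampling estimator over the 1000 reference points; the count of 45 points below is an illustrative value, not a figure from the paper.

```python
import math

def srs_area_estimate(n_total, n_class, region_area_km2, z=1.96):
    """Simple-random-sampling estimator of a class area and its ~95% margin of error."""
    p = n_class / n_total
    se = math.sqrt(p * (1 - p) / n_total)
    return p * region_area_km2, z * se * region_area_km2

LAOS_AREA_KM2 = 230_405
# e.g. if 45 of 1000 reference points were interpreted as New Shifting Cultivation
# in a 5-year period (an illustrative count):
area, moe = srs_area_estimate(1000, 45, LAOS_AREA_KM2)
print(f"{area:,.0f} ± {moe:,.0f} km² ({45/1000:.1%} ± {moe/LAOS_AREA_KM2:.1%} of Laos)")
```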

+
Table 2. Area of different land use activities for each period (5 years).

Area (ha)                     2001-2005    2006-2010    2011-2015    2016-2020
Fallow land -> fallow land    2,379,847    3,809,008    5,213,561    6,009,880
Fallow land -> cleared land     226,240      361,992      397,236      630,467
Cleared land -> fallow land     441,757      768,342      748,692      696,501
New shifting cultivation      1,198,106      806,418      714,256    1,036,823
+
Table 3. The country-average emission or removal factors for each period (5 years). The original emission or removal factors, except for new shifting cultivation, are spatially explicit; this table shows the country averages of the spatially explicit emission or removal factors.

Average emission/removal factors (Mg C ha⁻¹)   2001-2005   2006-2010   2011-2015   2016-2020
Fallow land -> fallow land                         -8.06       -7.57       -5.65       -1.56
Fallow land -> cleared land                        18.70       19.26       23.58       26.10
Cleared land -> fallow land                       -23.14      -24.28      -23.67      -21.36
New shifting cultivation                           75.95       75.95       75.95       75.95
+
Table 4. Carbon emissions (+) and removals (-) of different activities for each period (5 years).

Carbon emission/removal (Mg C)        2001-2005     2006-2010     2011-2015     2016-2020
Fallow land -> fallow land          -19,175,009   -28,833,216   -29,440,602    -9,348,118
Fallow land -> cleared land           4,230,290     6,970,956     9,366,236    16,452,893
Cleared land -> fallow land         -10,222,046   -18,657,539   -17,717,827   -14,879,752
New shifting cultivation             90,996,151    61,247,409    54,247,705    78,746,669
Period total (net emission/removal)  65,829,387    20,727,610    16,455,512    70,971,692
Annual average                       13,165,877     4,145,522     3,291,103    14,194,339
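As a consistency check, multiplying the country-average factors in table 3 by the activity areas in table 2 reproduces the 2001-2005 column of table 4 to within the rounding of the averaged factors:

```python
# Cross-check: country-average factors (table 3) x activity areas (table 2)
# should approximately reproduce the 2001-2005 emissions/removals in table 4.
areas_ha = {                    # table 2, 2001-2005
    "fallow -> fallow":   2_379_847,
    "fallow -> cleared":    226_240,
    "cleared -> fallow":    441_757,
    "new shifting cult.": 1_198_106,
}
factors = {                     # table 3, 2001-2005 (Mg C per ha)
    "fallow -> fallow":   -8.06,
    "fallow -> cleared":  18.70,
    "cleared -> fallow": -23.14,
    "new shifting cult.": 75.95,
}
total = 0.0
for activity, area in areas_ha.items():
    flux = factors[activity] * area
    total += flux
    print(f"{activity:20s} {flux:15,.0f} Mg C")
print(f"{'period total':20s} {total:15,.0f} Mg C  (table 4: 65,829,387)")
```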
+
+
Table 4 .Evaluationbasebase+curationΔ<class>1646173286<material>69437580637<me_method>1883193451<pressure>27436187<tc>37414269528<tcValue>10991556457Total15586174321846

scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

+
Table 3 .Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).MethodP (%)R (%)F1%)# docsPDF document87.8345.6152.6715Interface93.3892.5192.0215
+
Table 5 .Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).ExperienceMethodP (%)R (%)F1%)# docs# pagesMSPDF Document94.5836.5548.67646Interface83.1995.8388.25450PDPDF Document70.0048.5150.78549Interface96.6782.8688.11551SRPDF Document100.0055.5661.03451Interface97.4298.3397.78645
+
Table A2 .Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
+


+


+


+


+


+


+


+ + + +
+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
+
+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+
+
+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
+ + + 305-0044 + + +
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
+ + + + + + Automatic extraction of materials and properties from superconductors scientific literature + + LFoppiano + + + PBCastro + + + POSuarez + + 10.1080/27660400.2022.2153633 + + + Sci Technol Adv Mater + + 3 + 1 + 2153633 + 2023 + + + + + + + Materials discovery with machine learning and knowledge discovery + + ONOliveira + + + MJOliveira + + 10.3389/fchem.2022.930369 + + + Front Chem + + 10 + 10 + 2022 + + + + + + + Commentary: the materials project: a materials genome approach to accelerating materials innovation + + AJain + + + SPOng + + + GHautier + + 10.1063/1.4812323 + + + APL Mater + + 1 + 1 + 11002 + 2013 + + + + + + + Aflow: an automatic framework for high-throughput materials discovery + + SCurtarolo + + + WSetyawan + + + GLHart + + + + + Comput Mater Sci + + 58 + + 2012 + + + + + + + The nomad laboratory: from data sharing to artificial intelligence + + CDraxl + + + MScheffler + + 10.1088/2515-7639/ab13bb + + + J Phys Mater + + 2 + 3 + 36001 + 2019 + + + + + + + Global publication productivity in materials science research: a scientometric analysis + + TPratheepan + + + + + Indian J Inf Sources Serv + + 9 + 1 + + 2019 Feb + + + + + + + The PAULING FILE project and materials platform for data science: from big data toward materials genome + + EBlokhin + + + PVillars + + 10.1007/978-3-319-42913-7_62-1 + + 2018 + Springer International Publishing + + Cham + + + + + + + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + + MIshii + + + KSakamoto + + 10.1080/27660400.2023.2223051 + + + Sci Technol Adv Mater + + 3 + 1 + 2223051 + 2023 + + + + + + + Predicting new superconductors and their critical temperatures using machine learning + + BRoter + + + SDordevic + + 10.1016/j.physc.2020.1353689 + + + Phys C + + 575 + 1353689 + 2020 + + + + + + + Machine learning modeling of superconducting critical temperature + + VStanev + + + COses + + + AKusne + + 10.1038/s41524-018-0085-8 + + + Npj Comput Mater + + 4 + 1 + 4 + 2017 + + + + + + + Machine-learning approach for discovery of conventional superconductors + + HTran + + + TNVu + + arXiv:221103265. 2022 + + + arXiv preprint + + + + + Deep learning model for finding new superconductors + + TKonno + + + HKurokawa + + + FNabeshima + + 10.1103/PhysRevB.103.014509 + + + Phys Rev B + + 103 + 1 + 14509 + 2021 + + + + + + + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + + JCKlie + + + MBugert + + + BBoullosa + + + + + Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
+ + 2018 + + +
+
+ + + + Doccano: text annotation tool for human + + HNakayama + + + TKubo + + + JKamura + + + + + Software + + 2018 + + + + + + + Python materials genomics pymatgen: a robust open-source python library for materials analysis + + SPOng + + + WDRichards + + + AJain + + 10.1016/j.commatsci.2012.10.028 + + + Comput Mater Sci + + 68 + 2 + + 2013 + + + + + + + Text-mined dataset of inorganic materials synthesis recipes. Sci Data + + OKononova + + + HHuo + + + THe + + 10.1038/s41597-019-0224-1 + 41597-019-0224-1 + + + 2019 Oct + 6 + 203 + + + + + + + Label studio: data labeling software; 2020-2022 + + MTkachenko + + + MMalyuk + + + AHolmanyuk + + + + + Open source software + + + + + + + Supermat: construction of a linked annotated dataset from superconductors-related publications + + LFoppiano + + + SDieb + + + ASuzuki + + 10.1080/27660400.2021.1918396 + + + Sci Technol Adv Mater: Methods + + 1 + 1 + + 2021 + + + + + + + SciBERT: a pretrained language model for scientific text + + IBeltagy + + + KLo + + + ACohan + + + + + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
+ + Association for Computational Linguistics + Nov. 2019 + + +
+
+ + + + + <ptr target="https://github.com/kermitt2/delft"/> + </analytic> + <monogr> + <title level="j" coords="11,331.03,444.75,110.24,8.78">DeLFT contributors. Delft + + 2018-2023 + + + + + + + Overcoming catastrophic forgetting in neural networks + + JKirkpatrick + + + RPascanu + + + NCRabinowitz + + abs/1612.00796 + + + + CoRr + + 2016 + + + + + + + + <author> + <persName coords=""><forename type="first">G</forename><surname>Contributors</surname></persName> + </author> + <author> + <persName coords=""><surname>Grobid</surname></persName> + </author> + <ptr target="https://github.com/kermitt2/grobid"/> + <imprint> + <date type="published" when="2008">2008 -2023</date> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> \ No newline at end of file diff --git a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml new file mode 100644 index 0000000..b65cb98 --- /dev/null +++ b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml @@ -0,0 +1,760 @@ +<?xml version="1.0" encoding="UTF-8"?> +<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink"> + <teiHeader xml:lang="en"> + <fileDesc> + <titleStmt> + <title level="a" type="main">Science and Technology of Advanced Materials: Methods + + Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) + + + MEXT + + + unknown + + + + + + + + 14 Dec 2023. + + + + + + LucaFoppiano + 0000-0002-6114-6164 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+ + Knowledge and Data Engineering + Centre for Computational Sciences + University of Tsukuba +
+ Tsukuba + Japan; +
+
+
+ + TomoyaMato + 0000-0002-0918-6468 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+
+ + KenseiTerashima + 0000-0003-0375-3043 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + PedroOrtiz Suarez + 0000-0003-0343-8852 + + GmbH DFKI + CONTACT Luca Foppiano +
+ Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii + 0000-0003-0375 + Berlin + DE +
+
+
+ + TakuTou + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + ChikakoSakai + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + Wei-ShengWang + 0009-0001-3572-5736 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + ToshiyukiAmagasa + 0000-0003-0595-2230 + + Knowledge and Data Engineering + Centre for Computational Sciences + University of Tsukuba +
+ Tsukuba + Japan; +
+
+
+ + YoshihikoTakano + 0000-0002-1541-6928 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + MasashiIshii + ishii.masashi@nims.go.jp + 0000-0003-0357-2832 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+
+ + Masashi + + Science and Technology of Advanced Materials: Methods +
+ + Print + + 14 Dec 2023. + + + DCB0425EE18794E34CC3A3075E3E3975 + 10.1080/27660400.2023.2286219 + Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023 +
+
+ + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + Materials informatics + superconductors + machine learning + database + TDM + + + +

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from the literature, called SuperCon 2, to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations: the former include 'anomaly detection', which scans new data to identify outliers, and a 'training data collector' mechanism, which collects training data examples based on manual corrections. Such a training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

+
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

+
+
+ + + +
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, the 'SuperCon 2 Interface', crafted to produce structured data of higher quality, and more efficiently, than the 'traditional' manual approach of reading documents and noting records, usually in an Excel file. We developed this framework around the specific use case of SuperCon; however, our goal is for it to be adaptable to other data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using them for data correction results in higher quality than the 'traditional' (manual) approach.

The remainder of this article is organised as follows: Section 2 describes the curation workflow and Section 3 the user interface built on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

+
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

+
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

+
Curation status

The curation status (Figure 1) is defined by the type of action, manual or automatic, and by a status value, which can be one of the following (a minimal sketch of these statuses and the manual actions is given after the list):

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status; see Note 1).

• removed: the record has been removed by a curator (internal status).
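The following minimal sketch in Python illustrates these status values and the manual actions described above. It is an illustration only, not the actual SuperCon 2 implementation, and all names (Status, ACTIONS, apply_action, the record fields) are assumptions.

from enum import Enum

class Status(Enum):
    NEW = "new"
    CURATED = "curated"
    VALIDATED = "validated"
    INVALID = "invalid"
    OBSOLETE = "obsolete"   # internal: superseded by a newer record
    REMOVED = "removed"     # internal: removed by a curator

# Manual actions and the status they lead to (simplified view of Figure 1).
ACTIONS = {
    "mark_valid": Status.VALIDATED,
    "mark_invalid": Status.INVALID,
    "update": Status.CURATED,    # the previous version becomes OBSOLETE
    "remove": Status.REMOVED,
}

def apply_action(record: dict, action: str) -> dict:
    """Return an updated copy of the record; the correction history is kept elsewhere."""
    return dict(record, status=ACTIONS[action].value, type="manual")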

+
Error types

We first introduced error types in [1] and extended their scope in this work to cover data curation and anomaly detection. Users are required to select one error type at every record update or removal. This information is stored in the 'original' record and can differ at every record modification. The error type values can be summarised as follows:

• Composition resolution: the exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: the extracted formula contains variables that cannot be resolved, even after having read the paper. This includes cases where the data comes from tables.
• Tc classification: the temperature is not correctly classified (e.g. Tm or TCurie extracted as the superconducting critical temperature).
• Anomaly detection: the data has been modified by the anomaly detection process, which facilitates its retrieval from the interface.
• Curation amends: the curator is updating data which does not present issues caused by the automatic system.

+
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches any of the following conditions (a minimal sketch of these rules is given after the list):

• the extracted Tc is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]');
• the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15] and text2chem [16];
• the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.
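A minimal sketch of these rules, assuming a record dictionary with 'tc', 'formula', and 'pressure' fields (the field names are assumptions). The formula check uses pymatgen's Composition parser, one of the two parsers mentioned above; the actual system combines it with text2chem.

from pymatgen.core import Composition

ROOM_TEMPERATURE_K = 273.0
PRESSURE_RANGE_GPA = (0.0, 250.0)

def is_anomaly(record: dict) -> bool:
    """Rule-based screening. Flagged records receive status 'invalid' and
    error type 'anomaly detection', and still require curator validation."""
    tc = record.get("tc")
    if tc is not None:
        try:
            tc_value = float(tc)
            if tc_value < 0 or tc_value > ROOM_TEMPERATURE_K:
                return True
        except (TypeError, ValueError):
            return True  # unparsable values such as '41]'

    formula = record.get("formula")
    if formula is not None:
        try:
            Composition(formula)  # raises if the formula cannot be parsed
        except Exception:
            return True

    pressure = record.get("pressure")
    if pressure is not None:
        try:
            p = float(pressure)
            if not PRESSURE_RANGE_GPA[0] <= p <= PRESSURE_RANGE_GPA[1]:
                return True
        except (TypeError, ValueError):
            return True

    return False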

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

+
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure into the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

+
Training data collection

In the event of a correction (update, removal) of a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout token information. This information is sufficient to be exported as training examples, which can be examined, corrected, and fed back to the ML model; one such exported record is sketched below.
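For illustration, one collected training example could be represented as follows. The exact field names and serialisation used by the actual collector are assumptions; only the content (sentence text, entity spans with character offsets and labels, source document identifier, and a sent/not-sent status) follows the description above. The label names match those listed in Table 2.

# Hypothetical representation of one collected training example.
training_example = {
    "text": "MgB2 shows superconductivity at 39 K.",
    "spans": [
        {"start": 0, "end": 4, "label": "<material>"},    # "MgB2"
        {"start": 32, "end": 36, "label": "<tcValue>"},   # "39 K"
    ],
    "document_id": "doc-0001",   # identifier of the source PDF (hypothetical)
    "status": "not sent",        # whether it was sent to the annotation tool
}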

+
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2), in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, Python-based, modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

+
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

+
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for producing high-quality structured data.

We selected curators from domain experts in the field to ensure sufficient data quality. Nevertheless, as confirmed by our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation in the form of example-driven guidelines, following an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person and validated in a second round by a different individual.

+
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

+
Curation and processing logs

The SuperCon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled when new data is ingested; it was built with minimal functions able to explain why certain documents have not been processed (Figure 6, top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

+
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the automatically selected training data contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find that using the interface improves the quality of the curated data by reducing the amount of missing experimental data.

+
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate', which consists of the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in Tc, 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in Tc, 6 anomalies (37%) in chemical formulas, and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives, although a study with a larger dataset might be necessary.
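As a quick worked check of the quoted rejection rates (an illustration only, not part of the original evaluation code):

detected = {"tc": 17, "formula": 16, "pressure": 1}
rejected = {"tc": 4, "formula": 6, "pressure": 0}
for field in detected:
    rate = 100.0 * rejected[field] / detected[field]
    print(f"{field}: {rate:.1f}% of detected anomalies rejected")
# tc: 23.5%, formula: 37.5%, pressure: 0.0% (quoted as 23% and 37% in the text)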

+
Training data generation

We selected around 400 records in the SuperCon 2 Database that were marked as invalid by the anomaly detection process and corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected via the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset 'curation', to distinguish it from the original SuperMat dataset, which is referred to as 'base'.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols (sketched in the code below): (1) base(s), using the base dataset and training from scratch; (2) (base+curation)(s), using both the base and curation datasets and training from scratch; (3) base(s)+(base+curation)(i), using the base dataset to train from scratch and then continuing the training with the curation dataset incrementally. We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1], and the evaluation scores are shown in Table 1.
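The following sketch illustrates how the three protocols differ, using a hypothetical train() stub. The actual experiments use the DeLFT library with SciBERT; the function below is not the DeLFT API, only an illustration of the protocol logic.

def train(datasets, initial_model=None):
    """Hypothetical stand-in for a sequence-labelling training run.
    Passing initial_model continues training from that checkpoint (incremental)."""
    return {"trained_on": list(datasets), "initialised_from": initial_model}

# (1) base(s): train from scratch on the base (SuperMat) dataset
base_s = train(["base"])

# (2) (base+curation)(s): train from scratch on the merged dataset
base_curation_s = train(["base", "curation"])

# (3) base(s)+(base+curation)(i): train from scratch on base, then continue
#     incrementally on base+curation, starting from the previous weights
base_curation_i = train(["base", "curation"], initial_model=base_s)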

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset), comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of the F1-score from 76.67% (see Note 2) to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. [Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections made through the interface and then manually corrected. (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs of training and evaluation.] This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared with the base dataset. This could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the sizes of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could be further tuned to obtain better results with the second and third training protocols.

+
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators: a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF document method. Overall, each pair of curators had five papers in common, which they had to process using opposite methods. For instance, if curator A received paper 1 to be corrected with the interface, curator B, who received the same paper 1, corrected it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data are available in Tables A1 and A2.

We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures: precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
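For reference, the three measures can be computed from the true positive (TP), false positive (FP), and false negative (FN) counts reported per document in Table A2. This is a generic sketch, not the exact evaluation script used in the paper.

def precision_recall_f1(tp: int, fp: int, fn: int):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)
    return precision, recall, f1

# Example: TP=13, FP=1, FN=0 gives P=92.86%, R=100.00%, F1=96.30%,
# consistent with one of the rows in Table A2.
p, r, f1 = precision_recall_f1(13, 1, 0)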

+
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-score approximately 13% higher than the other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that, for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, with only a few senior researchers used for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators obtained better results overall when using the interface, as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

+
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the
Table 2. Data support: the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections made through the interface and then manually corrected.

+
Conclusions

We built a semi-automatic staging area, called SuperCon 2, to efficiently validate new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation, with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information into an Excel file, SuperCon 2 significantly improves the curation quality, by approximately +6% and +47% in precision and recall, respectively. In future, this work can be expanded to support other materials science domains, such as magnetic materials, spintronics, and thermoelectric research, and by extending the evaluation to a larger [22] dataset.

+
Notes

1. 'Internal status' indicates that such records should be hidden in the interface. 2. In our previous work [1] we reported a 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0, such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library, which is suffering from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1 .Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
+
Figure 2 .Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio[17]. The column 'status' indicate whether the example has been sent or not to the external tool.
+
Figure 3 .Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.
+
Figure 5 .Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record in exams. (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.
+
Figure 4 .Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
+
( 1 )base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
+
Figure 6 .Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates. By clicking on the 'record id', is possible to visualise the latest record values.
+
+
Tc classification: The temperature is not correctly classified
+
Table 4. Evaluation scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

Entity counts per label (the data support reported in Table 2):
Label | base | base+curation | Δ
<class> | 1646 | 1732 | 86
<material> | 6943 | 7580 | 637
<me_method> | 1883 | 1934 | 51
<pressure> | 274 | 361 | 87
<tc> | 3741 | 4269 | 528
<tcValue> | 1099 | 1556 | 457
Total | 15586 | 17432 | 1846

+
Table 3. Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).
Method | P (%) | R (%) | F1 (%) | # docs
PDF document | 87.83 | 45.61 | 52.67 | 15
Interface | 93.38 | 92.51 | 92.02 | 15
+
Table 5. Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher) and method (PDF document, interface).
Experience | Method | P (%) | R (%) | F1 (%) | # docs | # pages
MS | PDF document | 94.58 | 36.55 | 48.67 | 6 | 46
MS | Interface | 83.19 | 95.83 | 88.25 | 4 | 50
PD | PDF document | 70.00 | 48.51 | 50.78 | 5 | 49
PD | Interface | 96.67 | 82.86 | 88.11 | 5 | 51
SR | PDF document | 100.00 | 55.56 | 61.03 | 4 | 51
SR | Interface | 97.42 | 98.33 | 97.78 | 6 | 45
+
Table A2. Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score. Each row lists: Document ID, # pages, Method, # TP, # FP, # FN, P, R, F1.
Senior Researcher (SR)
0454e07f644I600100.00100.00100.00
00c32076f413P800100.00100.00100.00
0c7d3163ea9I131092.86100.0096.30
0da5febabf11P801100.0088.8994.12
001233358113I1100100.00100.00100.00
0aa1b3161f5I901100.0090.0094.74
0021fd339f14P408100.0033.3350.00
039105663f9I111091.67100.0095.65
02c4f0012713P003100.000.000.00
021c4131725I1500100.00100.00100.00
PhD Student (PS)
02bf1b3db97I502100.0071.4383.33
00b50fc0a811P207100.0022.2236.36
02cbc588194I403100.0057.1472.73
044939701d12P402100.0066.6780.00
08e1cb8f4f16I51183.3385.7184.51
0454e07f644P0150.0016.670.00
00c32076f413I800100.00100.00100.00
0c7d3163ea9P905100.0064.2978.26
0da5febabf11I900100.00100.00100.00
001233358113P44350.0072.7359.26
Master Student (MS)
0aa1b3161f5P109100.0010.0018.18
0021fd339f14I123380.00100.0088.89
039105663f9P41780.0041.6754.79
02c4f0012713I31175.00100.0085.71
021c4131725P71787.5053.3366.27
02bf1b3db97P205100.0028.5744.44
00b50fc0a811I72077.78100.0087.50
02cbc588194P502100.0071.4383.33
044939701d12I501100.0083.3390.91
08e1cb8f4f16P106100.0014.2925.00
+


+


+


+


+


+


+


+ + + +
+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
+
+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+
+
+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
+ + + 305-0044 + + +
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
+ + + + + + Automatic extraction of materials and properties from superconductors scientific literature + + LFoppiano + + + PBCastro + + + POSuarez + + 10.1080/27660400.2022.2153633 + + + Sci Technol Adv Mater + + 3 + 1 + 2153633 + 2023 + + + + + + + Materials discovery with machine learning and knowledge discovery + + ONOliveira + + + MJOliveira + + 10.3389/fchem.2022.930369 + + + Front Chem + + 10 + 10 + 2022 + + + + + + + Commentary: the materials project: a materials genome approach to accelerating materials innovation + + AJain + + + SPOng + + + GHautier + + 10.1063/1.4812323 + + + APL Mater + + 1 + 1 + 11002 + 2013 + + + + + + + Aflow: an automatic framework for high-throughput materials discovery + + SCurtarolo + + + WSetyawan + + + GLHart + + + + + Comput Mater Sci + + 58 + + 2012 + + + + + + + The nomad laboratory: from data sharing to artificial intelligence + + CDraxl + + + MScheffler + + 10.1088/2515-7639/ab13bb + + + J Phys Mater + + 2 + 3 + 36001 + 2019 + + + + + + + Global publication productivity in materials science research: a scientometric analysis + + TPratheepan + + + + + Indian J Inf Sources Serv + + 9 + 1 + + 2019 Feb + + + + + + + The PAULING FILE project and materials platform for data science: from big data toward materials genome + + EBlokhin + + + PVillars + + 10.1007/978-3-319-42913-7_62-1 + + 2018 + Springer International Publishing + + Cham + + + + + + + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + + MIshii + + + KSakamoto + + 10.1080/27660400.2023.2223051 + + + Sci Technol Adv Mater + + 3 + 1 + 2223051 + 2023 + + + + + + + Predicting new superconductors and their critical temperatures using machine learning + + BRoter + + + SDordevic + + 10.1016/j.physc.2020.1353689 + + + Phys C + + 575 + 1353689 + 2020 + + + + + + + Machine learning modeling of superconducting critical temperature + + VStanev + + + COses + + + AKusne + + 10.1038/s41524-018-0085-8 + + + Npj Comput Mater + + 4 + 1 + 4 + 2017 + + + + + + + Machine-learning approach for discovery of conventional superconductors + + HTran + + + TNVu + + arXiv:221103265. 2022 + + + arXiv preprint + + + + + Deep learning model for finding new superconductors + + TKonno + + + HKurokawa + + + FNabeshima + + 10.1103/PhysRevB.103.014509 + + + Phys Rev B + + 103 + 1 + 14509 + 2021 + + + + + + + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + + JCKlie + + + MBugert + + + BBoullosa + + + + + Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
+ + 2018 + + +
+
+ + + + Doccano: text annotation tool for human + + HNakayama + + + TKubo + + + JKamura + + + + + Software + + 2018 + + + + + + + Python materials genomics pymatgen: a robust open-source python library for materials analysis + + SPOng + + + WDRichards + + + AJain + + 10.1016/j.commatsci.2012.10.028 + + + Comput Mater Sci + + 68 + 2 + + 2013 + + + + + + + Text-mined dataset of inorganic materials synthesis recipes. Sci Data + + OKononova + + + HHuo + + + THe + + 10.1038/s41597-019-0224-1 + 41597-019-0224-1 + + + 2019 Oct + 6 + 203 + + + + + + + Label studio: data labeling software; 2020-2022 + + MTkachenko + + + MMalyuk + + + AHolmanyuk + + + + + Open source software + + + + + + + Supermat: construction of a linked annotated dataset from superconductors-related publications + + LFoppiano + + + SDieb + + + ASuzuki + + 10.1080/27660400.2021.1918396 + + + Sci Technol Adv Mater: Methods + + 1 + 1 + + 2021 + + + + + + + SciBERT: a pretrained language model for scientific text + + IBeltagy + + + KLo + + + ACohan + + + + + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
+ + Association for Computational Linguistics + Nov. 2019 + + +
+
+ + + + + <ptr target="https://github.com/kermitt2/delft"/> + </analytic> + <monogr> + <title level="j">DeLFT contributors. Delft + + 2018-2023 + + + + + + + Overcoming catastrophic forgetting in neural networks + + JKirkpatrick + + + RPascanu + + + NCRabinowitz + + abs/1612.00796 + + + + CoRr + + 2016 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Contributors</surname></persName> + </author> + <author> + <persName><surname>Grobid</surname></persName> + </author> + <ptr target="https://github.com/kermitt2/grobid"/> + <imprint> + <date type="published" when="2008">2008 -2023</date> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> \ No newline at end of file From 52ffc23b3febfc6c027f73ada0be02cfac4d2a6c Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Thu, 2 May 2024 16:33:06 +0900 Subject: [PATCH 17/46] revert to the original port --- resources/config/config-docker.yml | 4 ++-- resources/config/config.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/config/config-docker.yml b/resources/config/config-docker.yml index 6909ba8..e20bd5b 100644 --- a/resources/config/config-docker.yml +++ b/resources/config/config-docker.yml @@ -133,10 +133,10 @@ server: idleTimeout: 120 seconds applicationConnectors: - type: http - port: 8065 + port: 8060 adminConnectors: - type: http - port: 8066 + port: 8061 registerDefaultExceptionMappers: false maxThreads: 2048 maxQueuedRequests: 2048 diff --git a/resources/config/config.yml b/resources/config/config.yml index 39639d8..3bb04a3 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -134,10 +134,10 @@ server: idleTimeout: 120 seconds applicationConnectors: - type: http - port: 8065 + port: 8060 adminConnectors: - type: http - port: 8066 + port: 8061 registerDefaultExceptionMappers: false maxThreads: 2048 maxQueuedRequests: 2048 From 444843799398faf3d1b12a8e3d9f99533b717efd Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Thu, 2 May 2024 17:09:05 +0900 Subject: [PATCH 18/46] enable TEI processing in UI - javascript joy --- src/main/resources/web/datastet/datastet.js | 4 ++++ src/main/resources/web/index.html | 21 +++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/main/resources/web/datastet/datastet.js b/src/main/resources/web/datastet/datastet.js index 6295b88..80b2249 100644 --- a/src/main/resources/web/datastet/datastet.js +++ b/src/main/resources/web/datastet/datastet.js @@ -1453,15 +1453,19 @@ var grobid = (function ($) { if (selected == 'annotateDatasetSentence') { createInputTextArea(); + $('#segmentSentencesBlock').hide(); setBaseUrl('annotateDatasetSentence'); } else if (selected == 'annotateDatasetPDF') { createInputFile(selected); + $('#segmentSentencesBlock').hide(); setBaseUrl('annotateDatasetPDF'); } else if (selected == 'processDatasetTEI') { createInputFile(selected); + $('#segmentSentencesBlock').show(); setBaseUrl('processDatasetTEI'); } else if (selected == 'processDatasetJATS') { createInputFile(selected); + $('#segmentSentencesBlock').hide(); setBaseUrl('processDatasetJATS'); } }; diff --git a/src/main/resources/web/index.html b/src/main/resources/web/index.html index 0789c2c..e91919d 100644 --- a/src/main/resources/web/index.html +++ b/src/main/resources/web/index.html @@ -8,7 +8,7 @@ <!--script type="text/javascript" src="resources/js/jquery-1.10.0.min.js"></script> <script type="text/javascript" 
src="resources/js/jquery.form.js"></script> - <script type="text/javascript" src="resources/bootstrap/js/bootstrap.min.js"></script--> + <script type="text/javascript" src="resources/bootstrap/js/bootstrap.min.js"></script--> <script src="resources/js/jquery-1.8.1.min.js"></script> <script src="resources/js/jquery.form.js"></script> @@ -102,8 +102,8 @@ <h2><small> <select id="selectedService"> <option value="annotateDatasetSentence" selected>Process text sentence</option> <option value="annotateDatasetPDF">Process PDF</option> - <!--option value="processDatasetTEI">Process TEI</option> - <option value="processDatasetJATS">Process JATS/NLM/...</option--> + <option value="processDatasetTEI">Process TEI</option> + <option value="processDatasetJATS">Process JATS/NLM/...</option> </select> </td> </tr> @@ -112,7 +112,20 @@ <h2><small> <td><span id="label"> </span></td> <td> <div id="field"> - + <div id="checkboxes"> + <table> + <tr> + <td> + <div id="segmentSentencesBlock" class="checkbox"> + <input type="checkbox" id="segmentSentences" name="segmentSentences" checked/> + <label for="segmentSentences" value="1"> + Segmented into sentences + </label> + </div> + </td> + </tr> + </table> + </div> <div class="fileupload fileupload-new" data-provides="fileupload" id="fileInputDiv"> <div class="input-append"> <div class="uneditable-input span4" style="white-space:normal;"> From 4aad23dfc01f9421a87e6e45fdcff9a363dd3c4a Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Thu, 2 May 2024 17:28:40 +0900 Subject: [PATCH 19/46] correct parameter --- src/main/resources/web/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/resources/web/index.html b/src/main/resources/web/index.html index e91919d..6deccbc 100644 --- a/src/main/resources/web/index.html +++ b/src/main/resources/web/index.html @@ -117,8 +117,8 @@ <h2><small> <tr> <td> <div id="segmentSentencesBlock" class="checkbox"> - <input type="checkbox" id="segmentSentences" name="segmentSentences" checked/> - <label for="segmentSentences" value="1"> + <input type="checkbox" id="segmentSentences" name="segmentSentences" checked value="1"/> + <label for="segmentSentences"> Segmented into sentences </label> </div> From 6989335ee68087997633d418cadefcd927700ab0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Tue, 7 May 2024 07:48:13 +0900 Subject: [PATCH 20/46] attach URLs obtained from Grobid's TEI --- .../grobid/core/engines/DatasetParser.java | 173 ++++++++++++++++-- .../grobid/core/utilities/XMLUtilities.java | 43 +++-- 2 files changed, 188 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 0cb8061..8cd3f02 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -37,6 +37,8 @@ import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.*; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.utilities.counters.impl.CntManagerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,6 +59,8 @@ import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.grobid.core.utilities.XMLUtilities.BIBLIO_CALLOUT_TYPE; +import static 
org.grobid.core.utilities.XMLUtilities.URL_TYPE; /** * Identification of the dataset names, implicit dataset expressions and data acquisition device names in text. @@ -491,6 +495,49 @@ private List<DatasetComponent> addUrlComponents(List<LayoutToken> sentenceTokens return existingComponents; } + private List<DatasetComponent> addUrlComponentsAsReferences(List<LayoutToken> sentenceTokens, + List<DatasetComponent> existingComponents, + String text, + Map<String, Triple<OffsetPosition, String, String>> references) { + + // positions for lexical match + List<OffsetPosition> existingPositions = new ArrayList<>(); + for (DatasetComponent existingComponent : existingComponents) { + existingPositions.add(existingComponent.getOffsets()); + } + + for (String keyRef : references.keySet()) { + Triple<OffsetPosition, String, String> urlInfos = references.get(keyRef); + OffsetPosition pos = urlInfos.getLeft(); + String target = urlInfos.getMiddle(); +// String type = urlInfos.getRight(); + + DatasetComponent urlComponent = new DatasetComponent(text.substring(pos.start, pos.end)); + urlComponent.setOffsetStart(pos.start); + urlComponent.setOffsetEnd(pos.end); + if (target != null) { + urlComponent.setDestination(target); + urlComponent.setNormalizedForm(target); + } + + urlComponent.setLabel(DatasetTaggingLabels.DATASET_URL); + urlComponent.setType(DatasetType.URL); + +// urlComponent.setTokens(urlTokens); + +// List<BoundingBox> boundingBoxes = BoundingBoxCalculator.calculate(urlTokens); +// urlComponent.setBoundingBoxes(boundingBoxes); + + if (urlComponent.getNormalizedForm() != null) + urlComponent.setNormalizedForm(urlComponent.getNormalizedForm().replace(" ", "")); + + existingComponents.add(urlComponent); + } + + Collections.sort(existingComponents); + return existingComponents; + } + /** * Sequence labelling of a string for identifying dataset names. 
*/ @@ -1472,7 +1519,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do List<String> selectedSequences = new ArrayList<>(); //The references callout are loaded here, so that we can recover the position in the text // we need target, text value, and position (character related) - List<Map<String, Pair<OffsetPosition, String>>> selectedSequencesReferences = new ArrayList<>(); + List<Map<String, Triple<OffsetPosition, String, String>>> selectedSequencesReferences = new ArrayList<>(); List<Boolean> relevantSectionsNamedDatasets = new ArrayList<>(); List<Boolean> relevantSectionsImplicitDatasets = new ArrayList<>(); @@ -1509,6 +1556,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do org.w3c.dom.Node item = abstractNodeList.item(i); String text = item.getTextContent(); selectedSequences.add(text); + + // Capture URLs if available + //LF Not clear why true, just copied from around ProcessPDF:578 relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(false); @@ -1556,11 +1606,13 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do String text = item.getTextContent(); selectedSequences.add(text); + // Capture URLs if available + //LF Not clear why true, just copied from around ProcessPDF:635 relevantSectionsNamedDatasets.add(true); relevantSectionsImplicitDatasets.add(true); - Map<String, Pair<OffsetPosition, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); selectedSequencesReferences.add(referencesInText); } @@ -1724,9 +1776,14 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do List<BiblioComponent> bibRefComponents = new ArrayList<>(); Map<String, BiblioItem> biblioRefMap = new HashMap<>(); - for(Map<String, Pair<OffsetPosition, String>> ref :selectedSequencesReferences) { + List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequencesReferences.stream() + .filter(map -> map.values().stream() + .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) + .toList(); + + for(Map<String, Triple<OffsetPosition, String, String>> ref :referencesList) { for (String refText : ref.keySet()) { - Pair<OffsetPosition, String> infos = ref.get(refText); + Triple<OffsetPosition, String, String> infos = ref.get(refText); String target = infos.getRight(); OffsetPosition position = infos.getLeft(); @@ -1821,25 +1878,24 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do Map<String, Double> termProfiles = prepareTermProfiles(entities); List<List<OffsetPosition>> placeTaken = preparePlaceTaken(entities); - int index = 0; List<List<Dataset>> newEntities = new ArrayList<>(); - for (List<LayoutToken> sentenceTokens : selectedSequencesLayoutTokens) { + for (int i = 0; i < selectedSequencesReferences.size(); i++) { + List<LayoutToken> sentenceTokens = selectedSequencesLayoutTokens.get(i); List<Dataset> localEntities = propagateLayoutTokenSequence(sentenceTokens, - entities.get(index), + entities.get(i), termProfiles, termPattern, - placeTaken.get(index), + placeTaken.get(i), frequencies, - sentenceOffsetStarts.get(index)); + sentenceOffsetStarts.get(i)); if (localEntities != null) { Collections.sort(localEntities); // revisit and attach URL component - 
localEntities = attachUrlComponents(localEntities, sentenceTokens, selectedSequences.get(index), new ArrayList<>()); + localEntities = attachUrlComponents(localEntities, sentenceTokens, selectedSequences.get(i), selectedSequencesReferences.get(i)); } newEntities.add(localEntities); - index++; } entities = newEntities; @@ -2598,4 +2654,99 @@ public List<Dataset> attachUrlComponents(List<Dataset> datasets, return datasets; } + public List<Dataset> attachUrlComponents(List<Dataset> datasets, + List<LayoutToken> tokens, + String sentenceString, + Map<String, Triple<OffsetPosition, String, String>> references) { + // revisit url including propagated dataset names + if (datasets == null || datasets.size() == 0) { + return datasets; + } + + // Filter references only of type URLs + Map<String, Triple<OffsetPosition, String, String>> onlyURLs = references.entrySet().stream() + .filter(entry -> { + Triple<OffsetPosition, String, String> triple = entry.getValue(); + return triple.getRight().equals(URL_TYPE); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + if (CollectionUtils.sizeIsEmpty(onlyURLs)) { + return datasets; + } + + for (Dataset dataset : datasets) { + if (dataset == null) + continue; + + // reinit all URL + if (dataset.getUrl() != null) { + dataset.setUrl(null); + } + } + + List<DatasetComponent> localDatasetcomponents = new ArrayList<>(); + for (Dataset dataset : datasets) { + if (dataset.getDataset() != null) + localDatasetcomponents.add(dataset.getDataset()); + if (dataset.getDatasetName() != null) + localDatasetcomponents.add(dataset.getDatasetName()); + if (dataset.getDataDevice() != null) + localDatasetcomponents.add(dataset.getDataDevice()); + if (dataset.getPublisher() != null) + localDatasetcomponents.add(dataset.getPublisher()); + if (dataset.getBibRefs() != null) { + for (BiblioComponent biblio : dataset.getBibRefs()) { + localDatasetcomponents.add(biblio); + } + } + } + + Collections.sort(localDatasetcomponents); + + int sizeBefore = localDatasetcomponents.size(); + localDatasetcomponents = addUrlComponentsAsReferences(tokens, localDatasetcomponents, sentenceString, references); + + // attach URL to the closest dataset + while (localDatasetcomponents.size() - sizeBefore > 0) { + DatasetComponent previousComponent = null; + DatasetComponent urlComponent = null; + for (DatasetComponent localDatasetcomponent : localDatasetcomponents) { + if (localDatasetcomponent.getType() == DatasetType.URL && previousComponent != null) { + urlComponent = localDatasetcomponent; + break; + } + + if (localDatasetcomponent.getType() == DatasetType.DATASET_NAME || localDatasetcomponent.getType() == DatasetType.DATASET) + previousComponent = localDatasetcomponent; + } + + if (previousComponent != null && urlComponent != null) { + + // URL attachment + for (Dataset dataset : datasets) { + if (dataset.getDataset() != null && previousComponent.getType() == DatasetType.DATASET) { + if (dataset.getDataset().getOffsetStart() == previousComponent.getOffsetStart() && + dataset.getDataset().getOffsetEnd() == previousComponent.getOffsetEnd()) { + dataset.setUrl(urlComponent); + break; + } + } else if (dataset.getDatasetName() != null && previousComponent.getType() == DatasetType.DATASET_NAME) { + if (dataset.getDatasetName().getOffsetStart() == previousComponent.getOffsetStart() && + dataset.getDatasetName().getOffsetEnd() == previousComponent.getOffsetEnd()) { + dataset.setUrl(urlComponent); + break; + } + } + } + + // remove attached URL from components + 
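                // (each removal below shrinks localDatasetcomponents, so the enclosing
                //  while (localDatasetcomponents.size() - sizeBefore > 0) loop terminates once
                //  every URL component injected by addUrlComponentsAsReferences has been processed)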
localDatasetcomponents.remove(urlComponent); + } else { + break; + } + } + return datasets; + } + } diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index e5e9c2b..19a46fe 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -3,6 +3,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.data.BiblioItem; import org.grobid.core.sax.BiblStructSaxHandler; import org.slf4j.Logger; @@ -39,6 +40,10 @@ public class XMLUtilities { private static final Logger LOGGER = LoggerFactory.getLogger(XMLUtilities.class); + public static final String BIBLIO_CALLOUT_TYPE = "bibr"; + public static final String URL_TYPE = "url"; + private static final String URI_TYPE = "uri"; + public static String toPrettyString(String xml, int indent) { try { // Turn xml string into a document @@ -142,44 +147,48 @@ public static String getTextNoRefMarkers(Element element) { return found ? buf.toString() : null; } - public static Pair<String,Map<String,Pair<OffsetPosition,String>>> getTextNoRefMarkersAndMarkerPositions(Element element, int globalPos) { + public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> getTextNoRefMarkersAndMarkerPositions(Element element, int globalPos) { StringBuffer buf = new StringBuffer(); - NodeList list = element.getChildNodes(); + NodeList nodeChildren = element.getChildNodes(); boolean found = false; int indexPos = globalPos; // map a ref string with its position and the reference key as present in the XML - Map<String,Pair<OffsetPosition,String>> right = new TreeMap<>(); + Map<String, Triple<OffsetPosition,String, String>> right = new TreeMap<>(); // the key of the reference - String bibId = null; + String target = null; - for (int i = 0; i < list.getLength(); i++) { - Node node = list.item(i); + for (int i = 0; i < nodeChildren.getLength(); i++) { + Node node = nodeChildren.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) { if ("ref".equals(node.getNodeName())) { - if ("bibr".equals(((Element) node).getAttribute("type"))) { - bibId = ((Element) node).getAttribute("target"); - if (bibId != null && bibId.startsWith("#")) { - bibId = bibId.substring(1, bibId.length()); + if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { + target = ((Element) node).getAttribute("target"); + if (target != null && target.startsWith("#")) { + target = target.substring(1, target.length()); } + } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { + target = ((Element) node).getAttribute("target"); } // get the ref marker text NodeList list2 = node.getChildNodes(); for (int j = 0; j < list2.getLength(); j++) { - Node node2 = list2.item(j); - if (node2.getNodeType() == Node.TEXT_NODE) { - String chunk = node2.getNodeValue(); + Node subChildNode = list2.item(j); + if (subChildNode.getNodeType() == Node.TEXT_NODE) { + String chunk = subChildNode.getNodeValue(); - if ("bibr".equals(((Element) node).getAttribute("type"))) { - Pair<OffsetPosition, String> refInfo = Pair.of(new OffsetPosition(indexPos, indexPos+chunk.length()), bibId); + if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { + Triple<OffsetPosition, String, String> refInfo = Triple.of(new 
OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); right.put(chunk, refInfo); String holder = StringUtils.repeat(" ", chunk.length()); buf.append(holder); - } else if ("uri".equals(((Element) node).getAttribute("type")) || "url".equals(((Element) node).getAttribute("type"))) { - // added like normal text + } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { + org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); + right.put(chunk, urlInfo); + // we still add added like normal text buf.append(chunk); found = true; } else { From 7f0cdd5aadc7842a670ed9790cc59e3ff5ba1d4e Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Tue, 7 May 2024 09:22:50 +0900 Subject: [PATCH 21/46] fix frontend --- .../controller/DatastetProcessFile.java | 2 + src/main/resources/web/datastet/datastet.js | 44 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index cbfeca4..f32d7ed 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -28,6 +28,8 @@ import java.io.InputStream; import java.security.DigestInputStream; import java.security.MessageDigest; +import java.sql.Array; +import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; diff --git a/src/main/resources/web/datastet/datastet.js b/src/main/resources/web/datastet/datastet.js index 80b2249..ace4c93 100644 --- a/src/main/resources/web/datastet/datastet.js +++ b/src/main/resources/web/datastet/datastet.js @@ -213,11 +213,12 @@ var grobid = (function ($) { var url = urlLocal xhr.responseType = 'xml'; xhr.open('POST', url, true); + xhr.setRequestHeader('Accept', 'application/json'); xhr.onreadystatechange = function (e) { if (xhr.readyState == 4 && xhr.status == 200) { var response = e.target.response; - //console.log(response); + console.log(response); SubmitSuccesful(response, xhr.status); } else if (xhr.status != 200) { AjaxError("Response " + xhr.status + ": "); @@ -423,14 +424,16 @@ var grobid = (function ($) { function SubmitSuccesful(responseText, statusText) { var selected = $('#selectedService option:selected').attr('value'); + console.log(selected); + if (selected == 'annotateDatasetSentence') { SubmitSuccesfulText(responseText, statusText); } else if (selected == 'annotateDatasetPDF') { - //SubmitSuccesfulXML(responseText, statusText); + SubmitSuccesfulXML(responseText, statusText); } else if (selected == 'processDatasetTEI') { - //SubmitSuccesfulXML(responseText, statusText); + submitSuccesfulJSON(responseText, statusText); } else if (selected == 'processDatasetJATS') { - //SubmitSuccesfulXML(responseText, statusText); + SubmitSuccesfulXML(responseText, statusText); } } @@ -698,6 +701,39 @@ var grobid = (function ($) { $('#requestResult2').show(); } + function submitSuccesfulJSON(responseText, statusText) { + if ((responseText == null) || (responseText.length == 0)) { + $('#infoResult') + .html("<font color='red'>Error encountered while receiving the server's answer: response is empty.</font>"); + return; + } else { + $('#infoResult').html(''); + } + + var display = '<div 
class=\"note-tabs\"> \ + <ul id=\"resultTab\" class=\"nav nav-tabs\"> \ + <li class="active"><a href=\"#navbar-fixed-xml\" data-toggle=\"tab\">Response</a></li> \ + </ul> \ + <div class="tab-content"> \ + <div class="tab-pane active" id="navbar-fixed-annotation">\n'; + + + display += '<div class="tab-pane " id="navbar-fixed-xml">\n'; + display += "<pre class='prettyprint' id='xmlCode'>"; + display += "<pre class='prettyprint lang-xml' id='xmlCode'>"; + var testStr = vkbeautify.json(responseText); + + display += htmll(testStr); + + display += "</pre>"; + display += '</div></div></div>'; + + $('#requestResult').html(display); + window.prettyPrint && prettyPrint(); + + $('#requestResult').show(); + } + function fetchConcept(identifier, lang, successFunction) { $.ajax({ type: 'GET', From 1c5ff72213f3fff8308e9b2451feb72dc606c39b Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Tue, 7 May 2024 10:15:58 +0900 Subject: [PATCH 22/46] fix github action --- .github/workflows/ci-build-manual.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml index 7f596c1..293580f 100644 --- a/.github/workflows/ci-build-manual.yml +++ b/.github/workflows/ci-build-manual.yml @@ -31,7 +31,7 @@ jobs: id: docker_build uses: mr-smithers-excellent/docker-build-push@v6 with: - dockerfile: Dockerfile.local + dockerfile: Dockerfile.datastet username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} image: lfoppiano/datastet From 4cd73903e13c151429b55d198438d6cedeb61585 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Thu, 9 May 2024 10:38:59 +0900 Subject: [PATCH 23/46] fix wrong ifs - thanks intellij! --- src/main/java/org/grobid/core/engines/DatasetParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 8cd3f02..54e0799 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1116,7 +1116,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, int i = 0; for (List<Dataset> localDatasets : entities) { - if (CollectionUtils.isNotEmpty(localDatasets)) { + if (CollectionUtils.isEmpty(localDatasets)) { i++; continue; } @@ -1851,7 +1851,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < entities.size(); i++) { List<Dataset> localDatasets = entities.get(i); - if (CollectionUtils.isNotEmpty(localDatasets)) { + if (CollectionUtils.isEmpty(localDatasets)) { continue; } for (Dataset localDataset : localDatasets) { From df86b81cf746419e143fbe6153a60c4d930f6ac0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Thu, 9 May 2024 13:36:19 +0900 Subject: [PATCH 24/46] avoid exception when entities are empty --- src/main/java/org/grobid/core/engines/DatasetParser.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 54e0799..0821b23 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -2287,6 +2287,10 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) { FastMatcher termPattern = new FastMatcher(); List<String> added = new 
ArrayList<>(); for (List<Dataset> datasets : entities) { + if (CollectionUtils.isEmpty(datasets)){ + continue; + } + for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent == null) From 843463c0a04279c2d2691c135f193f6461596638 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Thu, 9 May 2024 14:01:35 +0900 Subject: [PATCH 25/46] avoid injecting null stuff --- src/main/java/org/grobid/core/engines/DatasetParser.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 0821b23..d2f1430 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -186,8 +186,8 @@ public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, List<P int i = 0; for (List<LayoutToken> tokens : tokensList) { - if (tokens == null || tokens.size() == 0) { - results.add(null); + if (CollectionUtils.isEmpty(tokens)) { + results.add(new ArrayList<>()); } else { String text = LayoutTokensUtil.toText(tokens); List<DatasetComponent> localDatasetcomponents = new ArrayList<>(); @@ -2358,6 +2358,9 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) { public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, List<LayoutToken> tokens) { Map<String, Integer> frequencies = new TreeMap<String, Integer>(); for (List<Dataset> datasets : entities) { + if (CollectionUtils.isEmpty(datasets)){ + continue; + } for (Dataset entity : datasets) { DatasetComponent nameComponent = entity.getDatasetName(); if (nameComponent == null) From 1b1da5fbbb540df34fd87de2197b9db7e7d93039 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Mon, 13 May 2024 07:21:54 +0900 Subject: [PATCH 26/46] reduce the timeout for checking the disambiguation service --- .../core/engines/DatasetDisambiguator.java | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java index 4c489f0..4744b25 100644 --- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java @@ -3,6 +3,8 @@ import nu.xom.Attribute; import nu.xom.Element; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.http.client.config.RequestConfig; import org.grobid.core.GrobidModels; import org.grobid.core.data.DatasetComponent; import org.grobid.core.data.Dataset; @@ -104,7 +106,7 @@ private DatasetDisambiguator(DatastetConfiguration configuration) { nerd_host = configuration.getEntityFishingHost(); nerd_port = configuration.getEntityFishingPort(); serverStatus = checkIfAlive(); - if (serverStatus == true) + if (serverStatus) ensureCustomizationReady(); } catch(Exception e) { LOGGER.error("Cannot read properties for disambiguation service", e); @@ -120,43 +122,45 @@ public boolean checkIfAlive() { boolean result = false; try { URL url = null; - if ( (nerd_port != null) && (nerd_port.length() > 0) ) - if (nerd_port.equals("443")) + if (StringUtils.isNotBlank(nerd_port)) { + if (nerd_port.equals("443")) { url = new URL("https://" + nerd_host + "/service/isalive"); - else + } else { url = new URL("http://" + nerd_host + ":" + nerd_port + "/service/isalive"); - else + } + 
} else url = new URL("http://" + nerd_host + "/service/isalive"); - LOGGER.debug("Calling: " + url.toString()); -//System.out.println("Calling: " + url.toString()); - CloseableHttpClient httpClient = HttpClients.createDefault(); - HttpGet get = new HttpGet(url.toString()); + LOGGER.debug("Calling: " + url); - CloseableHttpResponse response = null; - Scanner in = null; - try { - response = httpClient.execute(get); -//System.out.println(response.getStatusLine()); - int code = response.getStatusLine().getStatusCode(); - if (code != 200) { - LOGGER.error("Failed isalive service for disambiguation service entity-fishing, HTTP error code : " + code); - return false; - } else { - result = true; + int timeout = 5; + RequestConfig config = RequestConfig.custom() + .setConnectTimeout(timeout * 100) + .setConnectionRequestTimeout(timeout * 100) + .setSocketTimeout(timeout * 100).build(); + + try (CloseableHttpClient httpClient = HttpClientBuilder.create() + .setDefaultRequestConfig(config) + .build();) { + HttpGet get = new HttpGet(url.toString()); + + try (CloseableHttpResponse response = httpClient.execute(get)) { + int code = response.getStatusLine().getStatusCode(); + if (code != 200) { + LOGGER.error("Failed isalive service for disambiguation service entity-fishing, HTTP error code : " + code); + return false; + } else { + result = true; + } } - } finally { - if (in != null) - in.close(); - if (response != null) - response.close(); } + } catch (MalformedURLException e) { - LOGGER.error("disambiguation service not available: MalformedURLException"); + LOGGER.error("Disambiguation service not available: MalformedURLException"); } catch (HttpHostConnectException e) { - LOGGER.error("cannot connect to the disambiguation service"); + LOGGER.error("Cannot connect to the disambiguation service"); } catch(Exception e) { - LOGGER.error("disambiguation service not available", e); + LOGGER.error("Disambiguation service not available: generic error", e); } return result; From 75dd7112df8b1700154ee930ddaa54eb5fbce472 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Mon, 20 May 2024 09:07:08 +0900 Subject: [PATCH 27/46] fix the convention for sentence segmentation and enable it --- .../grobid/core/engines/DatasetParser.java | 25 +++++++++++++------ src/main/resources/web/index.html | 4 +-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index d2f1430..cb6de01 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -59,8 +59,7 @@ import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.grobid.core.utilities.XMLUtilities.BIBLIO_CALLOUT_TYPE; -import static org.grobid.core.utilities.XMLUtilities.URL_TYPE; +import static org.grobid.core.utilities.XMLUtilities.*; /** * Identification of the dataset names, implicit dataset expressions and data acquisition device names in text. 
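
The liveness check reworked in the previous patch ("reduce the timeout for checking the disambiguation service") bounds each network step with an explicit RequestConfig timeout and closes the client and response via try-with-resources. A minimal, self-contained sketch of that pattern, assuming Apache HttpClient 4.x on the classpath; the URL and the 500 ms budget are illustrative, not values taken from DatastetConfiguration:

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;

public class IsAliveSketch {

    // Returns true only when the service answers 200 within the given budget.
    static boolean isAlive(String url, int timeoutMs) {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(timeoutMs)           // TCP connect
                .setConnectionRequestTimeout(timeoutMs) // lease from the connection pool
                .setSocketTimeout(timeoutMs)            // inactivity while reading the response
                .build();
        try (CloseableHttpClient client = HttpClientBuilder.create()
                .setDefaultRequestConfig(config)
                .build();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            return response.getStatusLine().getStatusCode() == 200;
        } catch (Exception e) {
            // connection refused, timeout, malformed URL, ... -> treat as "not alive"
            return false;
        }
    }

    public static void main(String[] args) {
        // hypothetical local entity-fishing instance
        System.out.println(isAlive("http://localhost:8090/service/isalive", 500));
    }
}

Separating the connect, connection-request, and socket timeouts keeps a dead entity-fishing endpoint from stalling start-up for the platform's default (much longer) timeout.
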
@@ -1416,8 +1415,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processXML(File file, boolean org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(tei))); //document.getDocumentElement().normalize(); + // TODO: call pub2TEI with sentence segmentation + // It's likely that JATS don't have sentences - resultExtraction = processTEIDocument(document, true, disambiguate, addParagraphContext); + resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext); } catch (final Exception exp) { LOGGER.error("An error occured while processing the following XML file: " + file.getPath(), exp); @@ -1432,8 +1433,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEI(File file, boolean factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(file); - //document.getDocumentElement().normalize(); - resultExtraction = processTEIDocument(document, segmentSentences, disambiguate, addParagraphContext); + org.w3c.dom.Element root = document.getDocumentElement(); + if (segmentSentences) + segment(document, root); + resultExtraction = processTEIDocument(document, disambiguate, addParagraphContext); //tei = restoreDomParserAttributeBug(tei); } catch (final Exception exp) { @@ -1494,7 +1497,11 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(documentAsString))); //document.getDocumentElement().normalize(); - tei = processTEIDocument(document, segmentSentences, disambiguate, addParagraphContext); + org.w3c.dom.Element root = document.getDocumentElement(); + if (segmentSentences) + segment(document, root); + + tei = processTEIDocument(document, disambiguate, addParagraphContext); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (IOException e) { @@ -1512,7 +1519,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String doc * LF: This method attempt to reproduce the extraction from PDF in processPDF but with an already extracted TEI as input */ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.dom.Document doc, - boolean segmentSentences, boolean disambiguate, boolean addParagraphContext) { @@ -1526,6 +1532,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do //Extract relevant section from the TEI // Title, abstract, keywords + // If we process the TEI, at this point the document should be already segmented correctly. + boolean segmentSentences = true; + XPath xPath = XPathFactory.newInstance().newXPath(); try { @@ -1658,7 +1667,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Check the head? 
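            // note: "//*[local-name() = 'head']" is an absolute path, so even when evaluated
            // against `item` it matched the first <head> anywhere in the document; the relative
            // "./*[local-name() = 'head']" form below only looks at children of the current <div>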
String currentSection = null; - org.w3c.dom.Node head = (org.w3c.dom.Node) xPath.evaluate("//*[local-name() = 'head']", item, XPathConstants.NODE); + org.w3c.dom.Node head = (org.w3c.dom.Node) xPath.evaluate("./*[local-name() = 'head']", item, XPathConstants.NODE); if (head != null) { String headText = head.getTextContent(); diff --git a/src/main/resources/web/index.html b/src/main/resources/web/index.html index 6deccbc..d19994a 100644 --- a/src/main/resources/web/index.html +++ b/src/main/resources/web/index.html @@ -117,9 +117,9 @@ <h2><small> <tr> <td> <div id="segmentSentencesBlock" class="checkbox"> - <input type="checkbox" id="segmentSentences" name="segmentSentences" checked value="1"/> + <input type="checkbox" id="segmentSentences" name="segmentSentences" value="1"/> <label for="segmentSentences"> - Segmented into sentences + Segment into sentence (useful when the TEI is NOT already segmented) </label> </div> </td> From 758f418c35b0f9805279f0233ded1a4a95c3bddf Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 21 May 2024 14:58:54 +0900 Subject: [PATCH 28/46] update examples --- ...ntific literature.sample.segmented.tei.xml | 201 +++-- ...om scientific literature.segmented.tei.xml | 260 +++--- ...raction from scientific literature.tei.xml | 760 ------------------ 3 files changed, 246 insertions(+), 975 deletions(-) delete mode 100644 resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml diff --git a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.sample.segmented.tei.xml b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.sample.segmented.tei.xml index 664061f..20a3c41 100644 --- a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.sample.segmented.tei.xml +++ b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.sample.segmented.tei.xml @@ -1,22 +1,26 @@ <?xml version="1.0" encoding="UTF-8"?> -<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink"> +<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" +xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" + xmlns:xlink="http://www.w3.org/1999/xlink"> <teiHeader xml:lang="en"> <fileDesc> <titleStmt> - <title level="a" type="main" coords="1,127.92,45.60,407.40,14.93;1,97.10,173.24,385.43,17.91;1,97.10,194.24,356.44,17.91;1,97.10,215.24,81.54,17.91;2,59.98,104.63,472.55,12.94;2,59.98,120.62,113.99,12.94">Science and Technology of Advanced Materials: Methods + Science and Technology of Advanced Materials: Methods - MEXT + Ministry of Education, Culture, Sports, Science and Technology + MEXT Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) - + unknown - + 14 Dec 2023. 
@@ -27,7 +31,7 @@ LucaFoppiano 0000-0002-6114-6164 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -37,7 +41,7 @@ Japan; - + Knowledge and Data Engineering Centre for Computational Sciences University of Tsukuba @@ -50,7 +54,7 @@ TomoyaMato 0000-0002-0918-6468 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -64,7 +68,7 @@ KenseiTerashima 0000-0003-0375-3043 - + Frontier Superconducting Materials Group MANA NIMS @@ -77,7 +81,7 @@ PedroOrtiz Suarez 0000-0003-0343-8852 - + GmbH DFKI CONTACT Luca Foppiano
@@ -90,7 +94,7 @@ TakuTou - + Frontier Superconducting Materials Group MANA NIMS @@ -102,7 +106,7 @@ ChikakoSakai - + Frontier Superconducting Materials Group MANA NIMS @@ -115,7 +119,7 @@ Wei-ShengWang 0009-0001-3572-5736 - + Frontier Superconducting Materials Group MANA NIMS @@ -128,7 +132,7 @@ ToshiyukiAmagasa 0000-0003-0595-2230 - + Knowledge and Data Engineering Centre for Computational Sciences University of Tsukuba @@ -141,7 +145,7 @@ YoshihikoTakano 0000-0002-1541-6928 - + Frontier Superconducting Materials Group MANA NIMS @@ -155,7 +159,7 @@ MasashiIshii ishii.masashi@nims.go.jp 0000-0003-0357-2832 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -169,7 +173,7 @@ Masashi - Science and Technology of Advanced Materials: Methods + Science and Technology of Advanced Materials: Methods Print @@ -185,7 +189,7 @@ - + GROBID - A machine learning software for extracting information from scholarly documents @@ -194,16 +198,16 @@ - Materials informatics - superconductors - machine learning - database - TDM + Materials informatics + superconductors + machine learning + database + TDM -

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

-
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

+

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon.Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work.This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections.Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples.For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer.We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'.Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

+
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon.We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

@@ -224,32 +228,55 @@ -
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

-
Code availability

This work is available athttps://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

+
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2].This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties.As a matter of course, such an approach requires a larger amount of material-related data for training models.Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics.Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects.Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality.Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments.This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6].Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods.This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12].However, the updates of SuperCon have become increasingly challenging due to the high publication rate.In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications.This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1].Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process.We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record.At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information.There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file.We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database.We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach.The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it.Finally, we discuss our evaluation experiments and results in Section 4.

+
Code availability

This work is available at https://github.com/lfoppiano/ supercon2.The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2.Data support, the number of entities for each label in each of the datasets used for evaluating the ML models.The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

+
Figure 2 .

Figure 2.Screenshot of the training data management page in the SuperCon 2 interface.Each row contains one potential training data example.Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio[17].The column 'status' indicate whether the example has been sent or not to the external tool.

+
Figure 3 .

Figure 3.Screenshot of SuperCon 2 interface showing the database.Each row corresponds to one material-T c pair.On top, there are searches by attribute, sorting and other filtering operations.On the right there are curation controls (mark as valid, update, etc.).Records are grouped by document with alternating light yellow and white.

+
Figure 5 .

Figure 5.Sample curation sheet from the curation guidelines.The sheet is composed of the following information:(a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface',(b) Context represented by the related part of the annotated document referring to the record in exams.(c) The Motivation, describing the issue,(d) The Action to be taken, and the expected output.

+
Figure 4 .

Figure 4.PDF document viewer showing an annotated document.The table on top is linked through the annotated entities.The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.

+
( 1 )

base(s): using the base dataset and training from scratch (s).(2) (base+curation)(s): using both the base and curation datasets and training from scratch (s).(3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).

+
Figure 6 .

Figure 6.Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred.Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates.By clicking on the 'record id', is possible to visualise the latest record values.

+

+
c classification: The temperature is not correctly classified

The material is incorrectly linked to the T c given that the entities are correctly recognised.

• From table: the entities Material ! T c !Pressure are identified in a table. At the moment,table extraction is not performed• Extraction: The material, temperature, and pressureare not extracted (no box) or extracted incorrectly.

• Linking: • T as 'superconductors critical temperature' (e.g. Curie temperature, Magnetic temperature. . .).

+
Table 4 .

Evaluation

basebase+curationΔ<class>1646173286<material>69437580637<me_method>1883193451<pressure>27436187<tc>37414269528<tcValue>10991556457Total15586174321846

scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher).Each person corrected 10 documents.

+
Table 3 .

Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).

MethodP (%)R (%)F1%)# docsPDF document87.8345.6152.6715Interface93.3892.5192.0215
+
Table 5 .

Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).

ExperienceMethodP (%)R (%)F1%)# docs# pagesMSPDF Document94.5836.5548.67646Interface83.1995.8388.25450PDPDF Document70.0048.5150.78549Interface96.6782.8688.11551SRPDF Document100.0055.5661.03451Interface97.4298.3397.78645
+
Table A2 .

Evaluation scores obtained for each document and method (I: interface, P: PDF) combination.TP: true positive, FP: false positive, FN: false negative.P: precision, R: recall, F1: F1-score.

Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
+

Sci.Technol.Adv.Mater.Meth.3 (2023) 2 L. FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 3 L. FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 5 L. FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 6 L. FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 9L.FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 10 L. FOPPIANO et al.

+

Sci.Technol.Adv.Mater.Meth.3 (2023) 12 L. FOPPIANO et al.

-
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

-

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

-
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

- + 305-0044 +
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing.LF and POS discussed the ML results and experiments.LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface.LF designed the user interface experiment with KT, TT and WS as curators.KT led the materials-science work on the data with CS, TT and WS.KT, TA, YT and MI revised the paper.YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1.Timetable recording the time spent for each of the 15 articles.Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR).Duration is expressed in minutes.

- Automatic extraction of materials and properties from superconductors scientific literature + Automatic extraction of materials and properties from superconductors scientific literature LFoppiano @@ -262,7 +289,7 @@ 10.1080/27660400.2022.2153633 - Sci Technol Adv Mater + Sci Technol Adv Mater 3 1 @@ -274,7 +301,7 @@ - Materials discovery with machine learning and knowledge discovery + Materials discovery with machine learning and knowledge discovery ONOliveira @@ -284,7 +311,7 @@ 10.3389/fchem.2022.930369 - Front Chem + Front Chem 10 10 @@ -295,7 +322,7 @@ - Commentary: the materials project: a materials genome approach to accelerating materials innovation + Commentary: the materials project: a materials genome approach to accelerating materials innovation AJain @@ -308,7 +335,7 @@ 10.1063/1.4812323 - APL Mater + APL Mater 1 1 @@ -320,7 +347,7 @@ - Aflow: an automatic framework for high-throughput materials discovery + Aflow: an automatic framework for high-throughput materials discovery SCurtarolo @@ -330,13 +357,13 @@ GLHart - + - Comput Mater Sci + Comput Mater Sci 58 - + 2012 @@ -344,7 +371,7 @@ - The nomad laboratory: from data sharing to artificial intelligence + The nomad laboratory: from data sharing to artificial intelligence CDraxl @@ -354,7 +381,7 @@ 10.1088/2515-7639/ab13bb - J Phys Mater + J Phys Mater 2 3 @@ -366,18 +393,18 @@ - Global publication productivity in materials science research: a scientometric analysis + Global publication productivity in materials science research: a scientometric analysis TPratheepan - + - Indian J Inf Sources Serv + Indian J Inf Sources Serv 9 1 - + 2019 Feb @@ -385,7 +412,7 @@ - The PAULING FILE project and materials platform for data science: from big data toward materials genome + The PAULING FILE project and materials platform for data science: from big data toward materials genome EBlokhin @@ -396,7 +423,7 @@ 2018 Springer International Publishing - + Cham @@ -404,7 +431,7 @@ - Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases MIshii @@ -414,7 +441,7 @@ 10.1080/27660400.2023.2223051 - Sci Technol Adv Mater + Sci Technol Adv Mater 3 1 @@ -426,7 +453,7 @@ - Predicting new superconductors and their critical temperatures using machine learning + Predicting new superconductors and their critical temperatures using machine learning BRoter @@ -436,7 +463,7 @@ 10.1016/j.physc.2020.1353689 - Phys C + Phys C 575 1353689 @@ -447,7 +474,7 @@ - Machine learning modeling of superconducting critical temperature + Machine learning modeling of superconducting critical temperature VStanev @@ -460,7 +487,7 @@ 10.1038/s41524-018-0085-8 - Npj Comput Mater + Npj Comput Mater 4 1 @@ -472,7 +499,7 @@ - Machine-learning approach for discovery of conventional superconductors + Machine-learning approach for discovery of conventional superconductors HTran @@ -487,7 +514,7 @@ - Deep learning model for finding new superconductors + Deep learning model for finding new superconductors TKonno @@ -500,7 +527,7 @@ 10.1103/PhysRevB.103.014509 - Phys Rev B + Phys Rev B 103 1 @@ -512,7 +539,7 @@ - The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation JCKlie @@ -522,21 +549,21 @@ BBoullosa - + - Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + Proceedings 
of the 27th International Conference on Computational Linguistics: System Demonstrations the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
2018 - +
- Doccano: text annotation tool for human + Doccano: text annotation tool for human HNakayama @@ -546,10 +573,10 @@ JKamura - + - Software + Software 2018 @@ -558,7 +585,7 @@ - Python materials genomics pymatgen: a robust open-source python library for materials analysis + Python materials genomics pymatgen: a robust open-source python library for materials analysis SPOng @@ -571,11 +598,11 @@ 10.1016/j.commatsci.2012.10.028 - Comput Mater Sci + Comput Mater Sci 68 2 - + 2013 @@ -583,7 +610,7 @@ - Text-mined dataset of inorganic materials synthesis recipes. Sci Data + Text-mined dataset of inorganic materials synthesis recipes. Sci Data OKononova @@ -595,7 +622,7 @@ 10.1038/s41597-019-0224-1 41597-019-0224-1 - + 2019 Oct 6 @@ -606,7 +633,7 @@ - Label studio: data labeling software; 2020-2022 + Label studio: data labeling software; 2020-2022 MTkachenko @@ -616,17 +643,17 @@ AHolmanyuk - + - Open source software + Open source software - Supermat: construction of a linked annotated dataset from superconductors-related publications + Supermat: construction of a linked annotated dataset from superconductors-related publications LFoppiano @@ -639,11 +666,11 @@ 10.1080/27660400.2021.1918396 - Sci Technol Adv Mater: Methods + Sci Technol Adv Mater: Methods 1 1 - + 2021 @@ -651,7 +678,7 @@ - SciBERT: a pretrained language model for scientific text + SciBERT: a pretrained language model for scientific text IBeltagy @@ -661,15 +688,15 @@ ACohan - + - Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
Association for Computational Linguistics Nov. 2019 - +
@@ -677,10 +704,10 @@ - <ptr target="https://github.com/kermitt2/delft"/> + <ptr target="https://github.com/kermitt2/delft" /> </analytic> <monogr> - <title level="j" coords="11,331.03,444.75,110.24,8.78">DeLFT contributors. Delft + DeLFT contributors. Delft 2018-2023 @@ -689,7 +716,7 @@ - Overcoming catastrophic forgetting in neural networks + Overcoming catastrophic forgetting in neural networks JKirkpatrick @@ -700,10 +727,10 @@ NCRabinowitz abs/1612.00796 - + - CoRr + CoRr 2016 @@ -719,7 +746,7 @@ Grobid - + 2008 -2023 @@ -730,4 +757,4 @@
- \ No newline at end of file + diff --git a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.segmented.tei.xml b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.segmented.tei.xml index da6c5d8..97f847d 100644 --- a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.segmented.tei.xml +++ b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.segmented.tei.xml @@ -1,22 +1,26 @@ - + - Science and Technology of Advanced Materials: Methods + Science and Technology of Advanced Materials: Methods - MEXT + Ministry of Education, Culture, Sports, Science and Technology + MEXT Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) - + unknown - + 14 Dec 2023. @@ -27,7 +31,7 @@ LucaFoppiano 0000-0002-6114-6164 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -37,7 +41,7 @@ Japan;
- + Knowledge and Data Engineering Centre for Computational Sciences University of Tsukuba @@ -50,7 +54,7 @@ TomoyaMato 0000-0002-0918-6468 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -64,7 +68,7 @@ KenseiTerashima 0000-0003-0375-3043 - + Frontier Superconducting Materials Group MANA NIMS @@ -77,7 +81,7 @@ PedroOrtiz Suarez 0000-0003-0343-8852 - + GmbH DFKI CONTACT Luca Foppiano
@@ -90,7 +94,7 @@ TakuTou - + Frontier Superconducting Materials Group MANA NIMS @@ -102,7 +106,7 @@ ChikakoSakai - + Frontier Superconducting Materials Group MANA NIMS @@ -115,7 +119,7 @@ Wei-ShengWang 0009-0001-3572-5736 - + Frontier Superconducting Materials Group MANA NIMS @@ -128,7 +132,7 @@ ToshiyukiAmagasa 0000-0003-0595-2230 - + Knowledge and Data Engineering Centre for Computational Sciences University of Tsukuba @@ -141,7 +145,7 @@ YoshihikoTakano 0000-0002-1541-6928 - + Frontier Superconducting Materials Group MANA NIMS @@ -155,7 +159,7 @@ MasashiIshii ishii.masashi@nims.go.jp 0000-0003-0357-2832 - + Materials Modelling Group Centre for Basic Research on Materials Data-driven Materials Research Field @@ -169,7 +173,7 @@ Masashi - Science and Technology of Advanced Materials: Methods + Science and Technology of Advanced Materials: Methods Print @@ -185,7 +189,7 @@ - + GROBID - A machine learning software for extracting information from scholarly documents @@ -194,16 +198,16 @@ - Materials informatics - superconductors - machine learning - database - TDM + Materials informatics + superconductors + machine learning + database + TDM -

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

-
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

+

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

+
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

@@ -224,74 +228,74 @@ -
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

-
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

-
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

-
Curation status

The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ). • removed: the record has been removed by a curator (internal status).
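For illustration only (this is not code from the SuperCon 2 repository), the status values and the internal/visible distinction described above could be modelled as a small enumeration; the names below are assumptions of this sketch.

```python
from enum import Enum


class CurationStatus(Enum):
    """Status values listed above; identifiers are assumptions of this sketch."""
    NEW = "new"              # default status when a new record is created
    CURATED = "curated"      # the record has been amended manually
    VALIDATED = "validated"  # the record was manually marked as valid
    INVALID = "invalid"      # the record is wrong or inappropriate
    OBSOLETE = "obsolete"    # superseded by an updated record (internal status)
    REMOVED = "removed"      # removed by a curator (internal status)


# Internal statuses are hidden in the interface (see Note 1).
INTERNAL_STATUSES = {CurationStatus.OBSOLETE, CurationStatus.REMOVED}


def is_visible(status: CurationStatus) -> bool:
    """Only non-internal records are shown in the curation interface."""
    return status not in INTERNAL_STATUSES
```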

-
Error types

We first introduced error types in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one error type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows:

• Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables.

• Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.

• Curation amends: The curator is updating the data which does not present issues due to the automatic system.

-
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches the following conditions

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.
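As a concrete illustration of these rules, the following is a minimal, self-contained sketch of such a rule-based check. It is not the actual SuperCon 2 implementation: the real pipeline delegates formula parsing to an ensemble of Pymatgen [15] and text2chem [16], whereas the helpers and field names below are simple placeholders.

```python
import re

ROOM_TEMPERATURE_K = 273.0   # extracted Tc above this value (or negative) is flagged
MAX_PRESSURE_GPA = 250.0     # applied pressure outside 0-250 GPa is flagged

_NUMBER = re.compile(r"^[+-]?\d+(\.\d+)?$")


def parse_number(value):
    """Placeholder parser: return a float, or None when the value cannot be interpreted."""
    text = ("" if value is None else str(value)).strip()
    text = text.removesuffix("GPa").removesuffix("K").strip()
    return float(text) if _NUMBER.match(text) else None


def looks_like_formula(value):
    """Placeholder check; the real pipeline uses an ensemble of Pymatgen and text2chem."""
    return bool(value) and bool(re.fullmatch(r"[A-Za-z0-9().,+\- ]+", value)) \
        and any(c.isupper() for c in value)


def detect_anomalies(record):
    """Return the reasons why a record should get status 'invalid' / error type 'anomaly detection'."""
    reasons = []
    tc = parse_number(record.get("criticalTemperature"))
    if tc is None or tc < 0 or tc > ROOM_TEMPERATURE_K:
        reasons.append("tc not parsable or out of range")
    if not looks_like_formula(record.get("formula")):
        reasons.append("formula not parsable")
    pressure_raw = record.get("appliedPressure")
    if pressure_raw is not None:
        pressure = parse_number(pressure_raw)
        if pressure is None or not 0 <= pressure <= MAX_PRESSURE_GPA:
            reasons.append("pressure not parsable or out of range")
    return reasons


# Example: a record with Tc garbled as '41]' and an applied pressure of 500 GPa is flagged twice.
print(detect_anomalies({"criticalTemperature": "41]",
                        "formula": "LaFeAsO0.9F0.1",
                        "appliedPressure": "500 GPa"}))
```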

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

-
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time and collect as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

-
Training data collection

In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined, corrected, and fed back to the ML model.
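As a sketch of that export step (the field names are assumptions of this illustration, not the actual SuperCon 2 data model), a corrected record could be bundled into a training example roughly as follows:

```python
def to_training_example(correction):
    """Bundle the raw data attached to a corrected record into one training example.

    The dictionary keys are assumptions of this sketch: 'text_passage' is the
    sentence containing the record, 'spans' are the recognised entities, and
    'layout_tokens' is the layout token information mentioned above.
    """
    return {
        "document_id": correction["document_id"],
        "text": correction["text_passage"],
        # each span keeps its label and character offsets inside the passage
        "spans": [
            {"label": span["label"], "start": span["start"], "end": span["end"]}
            for span in correction["spans"]
        ],
        "layout_tokens": correction["layout_tokens"],
        "status": "new",  # not yet sent to the external annotation tool
    }
```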

-
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2), in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete them, send them to the annotation tool to be corrected, and then export them. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, Python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

-
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

-
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

-
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

-
Curation and processing logs

The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

-
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find that using the interface improves the quality of the curated data by reducing missing experimental data.

-
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate', which is the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, we found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas, and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives, although a study with a larger dataset might be necessary.

-
Training data generation

We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: base(s), (base+curation)(s), and base(s)+(base+curation)(i) (see the list reproduced with the figure captions below). We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1], and the evaluation scores are shown in Table 1.
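Schematically, and assuming hypothetical train and continue_training helpers (the actual training is performed with DeLFT [20]), the three protocols differ only in which dataset is used and whether the weights start from scratch or from an existing model:

```python
def run_protocols(base, curation, train, continue_training):
    """Sketch of the three protocols. `train` starts from randomly initialised
    weights and `continue_training` resumes from an existing model; both are
    assumed callables supplied by the training framework."""
    models = {}
    # (1) base(s): train from scratch on the base dataset only.
    models["base(s)"] = train(base)
    # (2) (base+curation)(s): train from scratch on the merged dataset.
    models["(base+curation)(s)"] = train(base + curation)
    # (3) base(s)+(base+curation)(i): continue training model (1) on the merged dataset.
    models["base(s)+(base+curation)(i)"] = continue_training(models["base(s)"], base + curation)
    return models
```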

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of F1-score from 76.67% (see Note 2) to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively (Table 1). Table 1 reports the F1-score from the evaluation of the fine-tuned SciBERT models; the training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected; (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18], and the results are averaged over five runs of training and evaluation. This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could still be better tuned to obtain better results with the second and third training protocols.

-
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators: a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data are available in Tables A1 and A2.

We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
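For reference, in terms of true positives (TP), false positives (FP), and false negatives (FN), these standard measures are defined as:

```latex
P = \frac{TP}{TP + FP}, \qquad
R = \frac{TP}{TP + FN}, \qquad
F_1 = \frac{2\,P\,R}{P + R}
```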

-
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-score approximately 13% higher than the other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that, for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, with only a few senior researchers used for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

-
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the

Table 2. Data support: the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

-
Conclusions

We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation, with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information into an Excel file, SuperCon 2 significantly improves the curation quality, by approximately +6% and +47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronics, and thermoelectric research, and by extending the evaluation to a larger dataset [22].

-
Notes

1. 'internal status' indicates that such records should be hidden in the interface. 2. In our previous work [1] we reported a 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0, such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library, which is suffering from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1 .Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
-
Figure 2 .Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio[17]. The column 'status' indicate whether the example has been sent or not to the external tool.
-
Figure 3 .Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.
-
Figure 5 .Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record in exams. (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.
-
Figure 4 .Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
-
( 1 )base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
-
Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome, with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last update. By clicking on the 'record id', it is possible to visualise the latest record values.
-
-
T c classification: The temperature is not correctly classified
-
Table 4. Evaluation scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

(The following support counts correspond to the Table 2 caption: number of entities per label in the base and base+curation datasets.)

Label | base | base+curation | Δ
<class> | 1646 | 1732 | 86
<material> | 6943 | 7580 | 637
<me_method> | 1883 | 1934 | 51
<pressure> | 274 | 361 | 87
<tc> | 3741 | 4269 | 528
<tcValue> | 1099 | 1556 | 457
Total | 15586 | 17432 | 1846

-
Table 3. Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).

Method | P (%) | R (%) | F1 (%) | # docs
PDF document | 87.83 | 45.61 | 52.67 | 15
Interface | 93.38 | 92.51 | 92.02 | 15
-
Table 5. Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher) and method (PDF document, interface).

Experience | Method | P (%) | R (%) | F1 (%) | # docs | # pages
MS | PDF Document | 94.58 | 36.55 | 48.67 | 6 | 46
MS | Interface | 83.19 | 95.83 | 88.25 | 4 | 50
PD | PDF Document | 70.00 | 48.51 | 50.78 | 5 | 49
PD | Interface | 96.67 | 82.86 | 88.11 | 5 | 51
SR | PDF Document | 100.00 | 55.56 | 61.03 | 4 | 51
SR | Interface | 97.42 | 98.33 | 97.78 | 6 | 45
-
Table A2. Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.

Document ID | # pages | Method | # TP | # FP | # FN | P | R | F1
Senior Researcher (SR)
0454e07f64 | 4 | I | 6 | 0 | 0 | 100.00 | 100.00 | 100.00
00c32076f4 | 13 | P | 8 | 0 | 0 | 100.00 | 100.00 | 100.00
0c7d3163ea | 9 | I | 13 | 1 | 0 | 92.86 | 100.00 | 96.30
0da5febabf | 11 | P | 8 | 0 | 1 | 100.00 | 88.89 | 94.12
0012333581 | 13 | I | 11 | 0 | 0 | 100.00 | 100.00 | 100.00
0aa1b3161f | 5 | I | 9 | 0 | 1 | 100.00 | 90.00 | 94.74
0021fd339f | 14 | P | 4 | 0 | 8 | 100.00 | 33.33 | 50.00
039105663f | 9 | I | 11 | 1 | 0 | 91.67 | 100.00 | 95.65
02c4f00127 | 13 | P | 0 | 0 | 3 | 100.00 | 0.00 | 0.00
021c413172 | 5 | I | 15 | 0 | 0 | 100.00 | 100.00 | 100.00
PhD Student (PS)
02bf1b3db9 | 7 | I | 5 | 0 | 2 | 100.00 | 71.43 | 83.33
00b50fc0a8 | 11 | P | 2 | 0 | 7 | 100.00 | 22.22 | 36.36
02cbc58819 | 4 | I | 4 | 0 | 3 | 100.00 | 57.14 | 72.73
044939701d | 12 | P | 4 | 0 | 2 | 100.00 | 66.67 | 80.00
08e1cb8f4f | 16 | I | 5 | 1 | 1 | 83.33 | 85.71 | 84.51
0454e07f64 | 4 | P | 0 | 1 | 5 | 0.00 | 16.67 | 0.00
00c32076f4 | 13 | I | 8 | 0 | 0 | 100.00 | 100.00 | 100.00
0c7d3163ea | 9 | P | 9 | 0 | 5 | 100.00 | 64.29 | 78.26
0da5febabf | 11 | I | 9 | 0 | 0 | 100.00 | 100.00 | 100.00
0012333581 | 13 | P | 4 | 4 | 3 | 50.00 | 72.73 | 59.26
Master Student (MS)
0aa1b3161f | 5 | P | 1 | 0 | 9 | 100.00 | 10.00 | 18.18
0021fd339f | 14 | I | 12 | 3 | 3 | 80.00 | 100.00 | 88.89
039105663f | 9 | P | 4 | 1 | 7 | 80.00 | 41.67 | 54.79
02c4f00127 | 13 | I | 3 | 1 | 1 | 75.00 | 100.00 | 85.71
021c413172 | 5 | P | 7 | 1 | 7 | 87.50 | 53.33 | 66.27
02bf1b3db9 | 7 | P | 2 | 0 | 5 | 100.00 | 28.57 | 44.44
00b50fc0a8 | 11 | I | 7 | 2 | 0 | 77.78 | 100.00 | 87.50
02cbc58819 | 4 | P | 5 | 0 | 2 | 100.00 | 71.43 | 83.33
044939701d | 12 | I | 5 | 0 | 1 | 100.00 | 83.33 | 90.91
08e1cb8f4f | 16 | P | 1 | 0 | 6 | 100.00 | 14.29 | 25.00
-


+
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2].This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties.As a matter of course, such an approach requires a larger amount of material-related data for training models.Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics.Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects.Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality.Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments.This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6].Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods.This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12].However, the updates of SuperCon have become increasingly challenging due to the high publication rate.In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications.This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1].Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process.We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record.At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information.There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file.We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database.We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach.The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it.Finally, we discuss our evaluation experiments and results in Section 4.

+
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

+
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

+
Curation status

The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ).• removed: the record has been removed by a curator (internal status).

+
Error types

We first introduced error types in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one error type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows:

• Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables.

• Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.

• Curation amends: The curator is updating the data which does not present issues due to the automatic system.

+
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data.In our context, this means identifying data that are greatly different from the expected values.This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches the following conditions

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification.Since this process may find false positives, its output requires validation from curators.For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]).The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values.Further analysis and cross-references with contrasting information may be added in future.

+
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time and collect as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

+
Training data collection

In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined, corrected, and fed back to the ML model.

+
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2), in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete them, send them to the annotation tool to be corrected, and then export them. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, Python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

+
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1).It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3).The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence).Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

+
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality.Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result.We followed two principles to guarantee robustness in the curation process.First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18].Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

+
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions.The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2.Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

+
Curation and processing logs

The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

+
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work.The evaluation is composed of three sets of results.The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation.Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2) Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file.In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.

+
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation.Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas.Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure.This indicates an appropriate low rate of false positives although a study with a larger dataset might be necessary.

+
Training data generation

We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2).Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models.We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1].We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations.We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction.A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly.We denote this strategy with an (s).(2) 'incremental': when the initial model weights are taken from an already existing model.We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint.We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting.The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of F1-score from 76.67% (see Note 2) to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively (Table 1). Table 1 reports the F1-score from the evaluation of the fine-tuned SciBERT models; the training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected; (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18], and the results are averaged over five runs of training and evaluation. This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could still be better tuned to obtain better results with the second and third training protocols.

+
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators: a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data are available in Tables A1 and A2.

We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.

+
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method.When the experiment was carried out, not all the curators were familiar with the interface method.Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3).The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-score approximately 13% higher than the other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that, for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, with only a few senior researchers used for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

+
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the

Table 2. Data support: the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

+
Conclusions

We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation, with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information into an Excel file, SuperCon 2 significantly improves the curation quality, by approximately +6% and +47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronics, and thermoelectric research, and by extending the evaluation to a larger dataset [22].

+
Notes

1. 'internal status' indicates that such records should be hidden in the interface. 2. In our previous work [1] we reported a 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0, such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library, which is suffering from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1 .

Figure 1.Schema of the curation workflow.Each node has two properties: type and status (Section 2.1.1).Each edge indicates one action.The workflow starts on the left side of the figure.The new records begin with 'automatic, new'.Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node.Each combination of property values identifies each state.'(*)' indicates a transition for which the training data are collected (Section 2.3).

+
Figure 2 .

Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio [17]. The column 'status' indicates whether the example has been sent to the external tool or not.

+
Figure 3 .

Figure 3. Screenshot of the SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are search-by-attribute, sorting, and other filtering operations. On the right, there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.

+
Figure 5 .

Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface'; (b) Context: the related part of the annotated document referring to the record under examination; (c) Motivation: a description of the issue; (d) Action: what should be done, and the expected output.

+
Figure 4 .

Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.

+
( 1 )

(1) base(s): using the base dataset and training from scratch (s).
(2) (base+curation)(s): using both the base and curation datasets and training from scratch (s).
(3) base(s)+(base+curation)(i): using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).

+
Figure 6 .

Figure 6. Top: processing log, showing the output of each ingestion operation and the outcome, with the detailed error that may have occurred. Bottom: correction log, indicating each record, the number of updates, and the date/time of the last update. By clicking on the 'record id', it is possible to visualise the latest record values.

+

+
• T c classification: The temperature is not correctly classified as 'superconductors critical temperature' (e.g. Curie temperature, magnetic temperature, etc.).

• Linking: The material is incorrectly linked to the T c, given that the entities are correctly recognised.

• From table: the entities Material → T c → Pressure are identified in a table. At the moment, table extraction is not performed.

• Extraction: The material, temperature, and pressure are not extracted (no box) or are extracted incorrectly.

+
Table 4 .

Evaluation scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

Data support (the number of entities for each label, see Table 2):

Label         base    base+curation   Δ
<class>       1646    1732            86
<material>    6943    7580            637
<me_method>   1883    1934            51
<pressure>    274     361             87
<tc>          3741    4269            528
<tcValue>     1099    1556            457
Total         15586   17432           1846

+
Table 3 .

Evaluation scores (P: precision, R: recall, F1: F1-score) for curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).

Method          P (%)    R (%)    F1 (%)   # docs
PDF document    87.83    45.61    52.67    15
Interface       93.38    92.51    92.02    15
+
Table 5 .

Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher) and method (PDF document, interface).

Experience   Method         P (%)     R (%)    F1 (%)   # docs   # pages
MS           PDF document   94.58     36.55    48.67    6        46
MS           Interface      83.19     95.83    88.25    4        50
PD           PDF document   70.00     48.51    50.78    5        49
PD           Interface      96.67     82.86    88.11    5        51
SR           PDF document   100.00    55.56    61.03    4        51
SR           Interface      97.42     98.33    97.78    6        45
+
Table A2 .

Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.

Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
+

-
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

-

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

-
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

- + 305-0044
-
Disclosure statement

No potential conflict of interest was reported by the author(s).

-
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

-
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent on each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

- Automatic extraction of materials and properties from superconductors scientific literature + Automatic extraction of materials and properties from superconductors scientific literature LFoppiano @@ -304,7 +308,7 @@ 10.1080/27660400.2022.2153633 - Sci Technol Adv Mater + Sci Technol Adv Mater 3 1 @@ -316,7 +320,7 @@ - Materials discovery with machine learning and knowledge discovery + Materials discovery with machine learning and knowledge discovery ONOliveira @@ -326,7 +330,7 @@ 10.3389/fchem.2022.930369 - Front Chem + Front Chem 10 10 @@ -337,7 +341,7 @@ - Commentary: the materials project: a materials genome approach to accelerating materials innovation + Commentary: the materials project: a materials genome approach to accelerating materials innovation AJain @@ -350,7 +354,7 @@ 10.1063/1.4812323 - APL Mater + APL Mater 1 1 @@ -362,7 +366,7 @@ - Aflow: an automatic framework for high-throughput materials discovery + Aflow: an automatic framework for high-throughput materials discovery SCurtarolo @@ -372,13 +376,13 @@ GLHart - + - Comput Mater Sci + Comput Mater Sci 58 - + 2012 @@ -386,7 +390,7 @@ - The nomad laboratory: from data sharing to artificial intelligence + The nomad laboratory: from data sharing to artificial intelligence CDraxl @@ -396,7 +400,7 @@ 10.1088/2515-7639/ab13bb - J Phys Mater + J Phys Mater 2 3 @@ -408,18 +412,18 @@ - Global publication productivity in materials science research: a scientometric analysis + Global publication productivity in materials science research: a scientometric analysis TPratheepan - + - Indian J Inf Sources Serv + Indian J Inf Sources Serv 9 1 - + 2019 Feb @@ -427,7 +431,7 @@ - The PAULING FILE project and materials platform for data science: from big data toward materials genome + The PAULING FILE project and materials platform for data science: from big data toward materials genome EBlokhin @@ -438,7 +442,7 @@ 2018 Springer International Publishing - + Cham @@ -446,7 +450,7 @@ - Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases MIshii @@ -456,7 +460,7 @@ 10.1080/27660400.2023.2223051 - Sci Technol Adv Mater + Sci Technol Adv Mater 3 1 @@ -468,7 +472,7 @@ - Predicting new superconductors and their critical temperatures using machine learning + Predicting new superconductors and their critical temperatures using machine learning BRoter @@ -478,7 +482,7 @@ 10.1016/j.physc.2020.1353689 - Phys C + Phys C 575 1353689 @@ -489,7 +493,7 @@ - Machine learning modeling of superconducting critical temperature + Machine learning modeling of superconducting critical temperature VStanev @@ -502,7 +506,7 @@ 10.1038/s41524-018-0085-8 - Npj Comput Mater + Npj Comput Mater 4 1 @@ -514,7 +518,7 @@ - Machine-learning approach for discovery of conventional superconductors + Machine-learning approach for discovery of conventional superconductors HTran @@ -529,7 +533,7 @@ - Deep learning model for finding new superconductors + Deep learning model for finding new superconductors TKonno @@ -542,7 +546,7 @@ 10.1103/PhysRevB.103.014509 - Phys Rev B + Phys Rev B 103 1 @@ -554,7 +558,7 @@ - The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation JCKlie @@ -564,21 +568,21 @@ BBoullosa - + - Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + Proceedings 
of the 27th International Conference on Computational Linguistics: System Demonstrations the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
2018 - +
- Doccano: text annotation tool for human + Doccano: text annotation tool for human HNakayama @@ -588,10 +592,10 @@ JKamura - + - Software + Software 2018 @@ -600,7 +604,7 @@ - Python materials genomics pymatgen: a robust open-source python library for materials analysis + Python materials genomics pymatgen: a robust open-source python library for materials analysis SPOng @@ -613,11 +617,11 @@ 10.1016/j.commatsci.2012.10.028 - Comput Mater Sci + Comput Mater Sci 68 2 - + 2013 @@ -625,7 +629,7 @@ - Text-mined dataset of inorganic materials synthesis recipes. Sci Data + Text-mined dataset of inorganic materials synthesis recipes. Sci Data OKononova @@ -637,7 +641,7 @@ 10.1038/s41597-019-0224-1 41597-019-0224-1 - + 2019 Oct 6 @@ -648,7 +652,7 @@ - Label studio: data labeling software; 2020-2022 + Label studio: data labeling software; 2020-2022 MTkachenko @@ -658,17 +662,17 @@ AHolmanyuk - + - Open source software + Open source software - Supermat: construction of a linked annotated dataset from superconductors-related publications + Supermat: construction of a linked annotated dataset from superconductors-related publications LFoppiano @@ -681,11 +685,11 @@ 10.1080/27660400.2021.1918396 - Sci Technol Adv Mater: Methods + Sci Technol Adv Mater: Methods 1 1 - + 2021 @@ -693,7 +697,7 @@ - SciBERT: a pretrained language model for scientific text + SciBERT: a pretrained language model for scientific text IBeltagy @@ -703,15 +707,15 @@ ACohan - + - Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
Association for Computational Linguistics Nov. 2019 - +
@@ -719,10 +723,10 @@ - <ptr target="https://github.com/kermitt2/delft"/> + <ptr target="https://github.com/kermitt2/delft" /> </analytic> <monogr> - <title level="j" coords="11,331.03,444.75,110.24,8.78">DeLFT contributors. Delft + DeLFT contributors. Delft 2018-2023 @@ -731,7 +735,7 @@ - Overcoming catastrophic forgetting in neural networks + Overcoming catastrophic forgetting in neural networks JKirkpatrick @@ -742,10 +746,10 @@ NCRabinowitz abs/1612.00796 - + - CoRr + CoRr 2016 @@ -761,7 +765,7 @@ Grobid - + 2008 -2023 @@ -772,4 +776,4 @@
- \ No newline at end of file + diff --git a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml b/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml deleted file mode 100644 index b65cb98..0000000 --- a/resources/samples/Semi-automatic staging area for high-quality structured data extraction from scientific literature.tei.xml +++ /dev/null @@ -1,760 +0,0 @@ - - - - - - Science and Technology of Advanced Materials: Methods - - Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) - - - MEXT - - - unknown - - - - - - - - 14 Dec 2023. - - - - - - LucaFoppiano - 0000-0002-6114-6164 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
- - Knowledge and Data Engineering - Centre for Computational Sciences - University of Tsukuba -
- Tsukuba - Japan; -
-
-
- - TomoyaMato - 0000-0002-0918-6468 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
-
- - KenseiTerashima - 0000-0003-0375-3043 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - PedroOrtiz Suarez - 0000-0003-0343-8852 - - GmbH DFKI - CONTACT Luca Foppiano -
- Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii - 0000-0003-0375 - Berlin - DE -
-
-
- - TakuTou - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - ChikakoSakai - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - Wei-ShengWang - 0009-0001-3572-5736 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - ToshiyukiAmagasa - 0000-0003-0595-2230 - - Knowledge and Data Engineering - Centre for Computational Sciences - University of Tsukuba -
- Tsukuba - Japan; -
-
-
- - YoshihikoTakano - 0000-0002-1541-6928 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - MasashiIshii - ishii.masashi@nims.go.jp - 0000-0003-0357-2832 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
-
- - Masashi - - Science and Technology of Advanced Materials: Methods -
- - Print - - 14 Dec 2023. - - - DCB0425EE18794E34CC3A3075E3E3975 - 10.1080/27660400.2023.2286219 - Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023 -
-
-
- - - - GROBID - A machine learning software for extracting information from scholarly documents - - - - - - - - Materials informatics - superconductors - machine learning - database - TDM - - - -

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

-
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

-
-
-
- - -
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

-
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

-
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

-
Curation status

The curation status (Figure 1) is defined by the type of action, manual or automatic, and by a status, which can assume the following values (a minimal sketch follows the list):

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1).
• removed: the record has been removed by a curator (internal status).
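To make the state model concrete, the following minimal sketch encodes these statuses and a few of the manual transitions in Python. It is an illustration only: the names (Status, apply_action) are hypothetical and do not come from the supercon2 code base.

from enum import Enum

class Status(Enum):
    NEW = "new"              # default status when a record is created
    CURATED = "curated"      # the record has been amended manually
    VALIDATED = "validated"  # manually marked as valid
    INVALID = "invalid"      # wrong or inappropriate for the situation
    OBSOLETE = "obsolete"    # internal: superseded by an updated record
    REMOVED = "removed"      # internal: removed by a curator

# Hypothetical helper mapping a manual action to the resulting status.
def apply_action(action: str) -> Status:
    transitions = {
        "mark_valid": Status.VALIDATED,
        "mark_invalid": Status.INVALID,
        "update": Status.CURATED,   # the previous values are kept in an OBSOLETE record
        "remove": Status.REMOVED,
    }
    if action not in transitions:
        raise ValueError(f"unknown action: {action}")
    return transitions[action]

print(apply_action("update"))  # Status.CURATED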

-
Error types

We first introduced error type in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows: • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes cases where the data comes from tables.
• Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.
• Curation amends: The curator is updating data which does not present issues due to the automatic system.

-
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches the following conditions:

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]');
• the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15] and text2chem [16];
• the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.
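As an illustration of how such rules can be applied, the sketch below re-implements the checks in Python. It uses Pymatgen's composition parser, as mentioned above, but the function name and structure are our own simplification; the actual supercon2 implementation also combines text2chem, which is omitted here.

from pymatgen.core import Composition

ROOM_TEMPERATURE_K = 273.0
PRESSURE_RANGE_GPA = (0.0, 250.0)

def is_anomaly(tc_k: str, formula: str, pressure_gpa=None) -> bool:
    """Simplified rule-based anomaly check on a single extracted record."""
    # Rule 1: Tc must be parsable, non-negative and below room temperature.
    try:
        tc = float(tc_k)
        if tc < 0 or tc > ROOM_TEMPERATURE_K:
            return True
    except ValueError:
        return True  # e.g. '41]' cannot be parsed
    # Rule 2: the chemical formula must be resolvable into a composition.
    try:
        Composition(formula)
    except Exception:
        return True
    # Rule 3: the applied pressure, when present, must parse and fall in 0-250 GPa.
    if pressure_gpa is not None:
        try:
            pressure = float(pressure_gpa)
        except ValueError:
            return True
        if not PRESSURE_RANGE_GPA[0] <= pressure <= PRESSURE_RANGE_GPA[1]:
            return True
    return False

print(is_anomaly("41]", "LaFeAsO0.9F0.1"))  # True: the Tc cannot be parsed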

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

-
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time and collect as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

-
Training data collection

In the event of a correction (update, removal) of a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout token information. This information is sufficient to be exported as training examples, which can be examined, corrected, and fed back to the ML model.
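A minimal sketch of what a collected training example could look like is shown below. The field names and the span format are assumptions chosen for illustration; the text only states that the passage, the recognised entity spans, and the layout information are retrieved at every correction.

from dataclasses import dataclass, field

@dataclass
class TrainingExample:
    document_id: str                 # provenance: the document the passage comes from
    text: str                        # the text passage containing the corrected record
    spans: list = field(default_factory=list)  # (start, end, label) of recognised entities
    status: str = "new"              # whether it has been sent to the annotation tool

def collect_example(record: dict, passage: str, spans: list) -> TrainingExample:
    """Hypothetical collector triggered by an update or removal of a database record."""
    return TrainingExample(document_id=record["document_id"], text=passage, spans=spans)

example = collect_example(
    {"document_id": "doc-000"},
    "MgB2 shows superconductivity at 39 K.",
    [(0, 4, "<material>"), (32, 36, "<tcValue>")],
)
print(example.status)  # "new": not yet exported to label-studio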

-
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2), in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. Users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, Python-based, modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

-
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

-
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

-
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

-
Curation and processing logs

The SuperCon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled when new data is ingested; it was built with minimal functions able to explain why certain documents have not been processed (Figure 6, top). For example, sometimes documents fail because they do not contain any text (image PDF documents) or because they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

-
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2) Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.

-
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate', which is the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c, 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c, 6 anomalies (37%) in chemical formulas, and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives, although a study with a larger dataset might be necessary.
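The rejection rates follow directly from the reported counts; the short check below reproduces them.

# (detected anomalies, rejected after human validation), from the subset of 667 records
detected = {"tc": 17, "chemical formula": 16, "applied pressure": 1}
rejected = {"tc": 4, "chemical formula": 6, "applied pressure": 0}

for name, n_detected in detected.items():
    rate = rejected[name] / n_detected
    print(f"{name}: {rejected[name]}/{n_detected} rejected ({rate:.1%})")
# tc: 4/17 rejected (23.5%); chemical formula: 6/16 rejected (37.5%); applied pressure: 0/1 (0.0%)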

-
Training data generation

We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.
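The difference between the protocols lies in how the model weights are initialised before fine-tuning. The sketch below illustrates that distinction using the HuggingFace transformers API as a stand-in for DeLFT; the checkpoint name, the number of labels, and the local path are assumptions, not the configuration used for the reported experiments.

from transformers import AutoModelForTokenClassification

NUM_LABELS = 13  # assumption: BIO tags for the six entity labels plus "O"

# (s) "from scratch": a randomly initialised task head on top of the pretrained
# SciBERT encoder, fine-tuned on the chosen dataset(s).
model_scratch = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_cased", num_labels=NUM_LABELS)

# (i) "incremental": the starting weights come from an already fine-tuned model
# (e.g. the one produced by base(s)); training then continues on base+curation.
model_incremental = AutoModelForTokenClassification.from_pretrained(
    "models/base_s/checkpoint-final", num_labels=NUM_LABELS)  # hypothetical local path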

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset), comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of the F1-score from 76.67% (Note 2) to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively.

Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs of training and evaluation.

This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the sizes of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could still be better tuned to obtain better results with the second and third training protocols.

-
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators: a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF document method. Overall, each pair of curators had five papers in common, which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data are available in Tables A1 and A2.
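The assignment scheme can be written down compactly: three curator pairs, five shared papers per pair, opposite methods within each pair. The sketch below builds one such balanced assignment with hypothetical paper identifiers; the document counts actually reported in Table 5 deviate slightly from this idealised split.

curators = ["SR", "PS", "MS"]
papers = [f"paper-{i:02d}" for i in range(1, 16)]      # 15 papers
pairs = [("SR", "PS"), ("PS", "MS"), ("MS", "SR")]     # each curator appears in two pairs

assignment = {c: [] for c in curators}                 # curator -> list of (paper, method)
for (with_interface, with_pdf), start in zip(pairs, range(0, 15, 5)):
    for paper in papers[start:start + 5]:              # five shared papers per pair
        assignment[with_interface].append((paper, "interface"))
        assignment[with_pdf].append((paper, "pdf"))

for curator in curators:
    methods = [m for _, m in assignment[curator]]
    print(curator, len(methods), "papers:",
          methods.count("interface"), "interface /", methods.count("pdf"), "pdf")
# each curator receives 10 papers, 5 per method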

We evaluated the curation from a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
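For reference, these measures reduce to a few lines given the true positive (TP), false positive (FP), and false negative (FN) counts reported in Table A2.

def precision_recall_f1(tp: int, fp: int, fn: int) -> tuple:
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# 13 correct, 1 spurious, and 0 missed extractions:
p, r, f1 = precision_recall_f1(13, 1, 0)
print(f"{p:.2%} {r:.2%} {f1:.2%}")  # 92.86% 100.00% 96.30%, cf. Table A2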

-
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-score approximately 13% higher than the other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that, for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, while reserving a few senior researchers for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators obtained overall more correct results when using the interface, as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

-
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface and the curation workflow.

Table 2. Data support: the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections made through the interface and then manually corrected.

-
Conclusions

We built a semi-automatic staging area, called SuperCon 2, to efficiently validate new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation, with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information into an Excel file, SuperCon 2 significantly improves the curation quality, by approximately +6% and +47% for precision and recall, respectively. In the future, this work can be expanded to support other materials science domains, such as magnetic materials, spintronics, and thermoelectric research, and by expanding the evaluation to a larger dataset [22].

-
Notes

1. 'Internal status' indicates that these records should be hidden in the interface. 2. In our previous work [1] we reported a 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0, such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library, which is suffering from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1 .Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
-
Figure 2 .Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio[17]. The column 'status' indicate whether the example has been sent or not to the external tool.
-
Figure 3 .Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.
-
Figure 5 .Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record in exams. (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.
-
Figure 4 .Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
-
( 1 )base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
-
Figure 6 .Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates. By clicking on the 'record id', is possible to visualise the latest record values.
-
-
c classification: The temperature is not correctly classified
-
Table 4 .Evaluationbasebase+curationΔ<class>1646173286<material>69437580637<me_method>1883193451<pressure>27436187<tc>37414269528<tcValue>10991556457Total15586174321846

scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

-
Table 3 .Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).MethodP (%)R (%)F1%)# docsPDF document87.8345.6152.6715Interface93.3892.5192.0215
-
Table 5 .Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).ExperienceMethodP (%)R (%)F1%)# docs# pagesMSPDF Document94.5836.5548.67646Interface83.1995.8388.25450PDPDF Document70.0048.5150.78549Interface96.6782.8688.11551SRPDF Document100.0055.5661.03451Interface97.4298.3397.78645
-
Table A2 .Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
-

- - - -
-
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

-
-
-

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

-
-
-
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

-
- - - 305-0044 - - -
-
Disclosure statement

No potential conflict of interest was reported by the author(s).

-
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

-
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

-
- - - - - - Automatic extraction of materials and properties from superconductors scientific literature - - LFoppiano - - - PBCastro - - - POSuarez - - 10.1080/27660400.2022.2153633 - - - Sci Technol Adv Mater - - 3 - 1 - 2153633 - 2023 - - - - - - - Materials discovery with machine learning and knowledge discovery - - ONOliveira - - - MJOliveira - - 10.3389/fchem.2022.930369 - - - Front Chem - - 10 - 10 - 2022 - - - - - - - Commentary: the materials project: a materials genome approach to accelerating materials innovation - - AJain - - - SPOng - - - GHautier - - 10.1063/1.4812323 - - - APL Mater - - 1 - 1 - 11002 - 2013 - - - - - - - Aflow: an automatic framework for high-throughput materials discovery - - SCurtarolo - - - WSetyawan - - - GLHart - - - - - Comput Mater Sci - - 58 - - 2012 - - - - - - - The nomad laboratory: from data sharing to artificial intelligence - - CDraxl - - - MScheffler - - 10.1088/2515-7639/ab13bb - - - J Phys Mater - - 2 - 3 - 36001 - 2019 - - - - - - - Global publication productivity in materials science research: a scientometric analysis - - TPratheepan - - - - - Indian J Inf Sources Serv - - 9 - 1 - - 2019 Feb - - - - - - - The PAULING FILE project and materials platform for data science: from big data toward materials genome - - EBlokhin - - - PVillars - - 10.1007/978-3-319-42913-7_62-1 - - 2018 - Springer International Publishing - - Cham - - - - - - - Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases - - MIshii - - - KSakamoto - - 10.1080/27660400.2023.2223051 - - - Sci Technol Adv Mater - - 3 - 1 - 2223051 - 2023 - - - - - - - Predicting new superconductors and their critical temperatures using machine learning - - BRoter - - - SDordevic - - 10.1016/j.physc.2020.1353689 - - - Phys C - - 575 - 1353689 - 2020 - - - - - - - Machine learning modeling of superconducting critical temperature - - VStanev - - - COses - - - AKusne - - 10.1038/s41524-018-0085-8 - - - Npj Comput Mater - - 4 - 1 - 4 - 2017 - - - - - - - Machine-learning approach for discovery of conventional superconductors - - HTran - - - TNVu - - arXiv:221103265. 2022 - - - arXiv preprint - - - - - Deep learning model for finding new superconductors - - TKonno - - - HKurokawa - - - FNabeshima - - 10.1103/PhysRevB.103.014509 - - - Phys Rev B - - 103 - 1 - 14509 - 2021 - - - - - - - The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation - - JCKlie - - - MBugert - - - BBoullosa - - - - - Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations - the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
- - 2018 - - -
-
- - - - Doccano: text annotation tool for human - - HNakayama - - - TKubo - - - JKamura - - - - - Software - - 2018 - - - - - - - Python materials genomics pymatgen: a robust open-source python library for materials analysis - - SPOng - - - WDRichards - - - AJain - - 10.1016/j.commatsci.2012.10.028 - - - Comput Mater Sci - - 68 - 2 - - 2013 - - - - - - - Text-mined dataset of inorganic materials synthesis recipes. Sci Data - - OKononova - - - HHuo - - - THe - - 10.1038/s41597-019-0224-1 - 41597-019-0224-1 - - - 2019 Oct - 6 - 203 - - - - - - - Label studio: data labeling software; 2020-2022 - - MTkachenko - - - MMalyuk - - - AHolmanyuk - - - - - Open source software - - - - - - - Supermat: construction of a linked annotated dataset from superconductors-related publications - - LFoppiano - - - SDieb - - - ASuzuki - - 10.1080/27660400.2021.1918396 - - - Sci Technol Adv Mater: Methods - - 1 - 1 - - 2021 - - - - - - - SciBERT: a pretrained language model for scientific text - - IBeltagy - - - KLo - - - ACohan - - - - - Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing - the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
- - Association for Computational Linguistics - Nov. 2019 - - -
-
- - - - - <ptr target="https://github.com/kermitt2/delft"/> - </analytic> - <monogr> - <title level="j">DeLFT contributors. Delft - - 2018-2023 - - - - - - - Overcoming catastrophic forgetting in neural networks - - JKirkpatrick - - - RPascanu - - - NCRabinowitz - - abs/1612.00796 - - - - CoRr - - 2016 - - - - - - - - <author> - <persName><forename type="first">G</forename><surname>Contributors</surname></persName> - </author> - <author> - <persName><surname>Grobid</surname></persName> - </author> - <ptr target="https://github.com/kermitt2/grobid"/> - <imprint> - <date type="published" when="2008">2008 -2023</date> - </imprint> - </monogr> -</biblStruct> - - </listBibl> - </div> - </back> - </text> -</TEI> \ No newline at end of file From 91fe70d9ff1f13c1c5cf98b26c5ef7bce1def2bd Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 21 May 2024 14:59:31 +0900 Subject: [PATCH 29/46] add sequence (sentence, paragraph) identifier in each mention --- .../java/org/grobid/core/data/Dataset.java | 58 ++- .../grobid/core/data/DatasetComponent.java | 13 +- .../core/data/DatasetDocumentSequence.java | 101 ++++ .../grobid/core/engines/DatasetParser.java | 457 ++++++++++-------- .../grobid/core/utilities/XMLUtilities.java | 3 + 5 files changed, 428 insertions(+), 204 deletions(-) create mode 100644 src/main/java/org/grobid/core/data/DatasetDocumentSequence.java diff --git a/src/main/java/org/grobid/core/data/Dataset.java b/src/main/java/org/grobid/core/data/Dataset.java index 952ff14..250494f 100644 --- a/src/main/java/org/grobid/core/data/Dataset.java +++ b/src/main/java/org/grobid/core/data/Dataset.java @@ -1,5 +1,8 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.assertj.core.util.Strings; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.OffsetPosition; @@ -120,6 +123,8 @@ public String getName() { // a flag to indicate if the entity is located in the Data Availability section private boolean inDataAvailabilitySection = false; + private final List<String> sequenceIdentifiers = new ArrayList<>(); + public Dataset(DatasetType type) { this.type = type; } @@ -130,6 +135,11 @@ public Dataset(DatasetType type, String rawForm) { this.normalizedForm = normalizeRawForm(rawForm); } + public Dataset(DatasetType type, String rawForm, List<String> sequenceIdentifiers) { + this(type, rawForm); + this.sequenceIdentifiers.addAll(sequenceIdentifiers); + } + public DatasetType getType() { return this.type; } @@ -141,7 +151,7 @@ public void setType(DatasetType type) { public String getRawForm() { return rawForm; } - + public void setRawForm(String raw) { this.rawForm = raw; this.normalizedForm = normalizeRawForm(raw); @@ -150,7 +160,7 @@ public void setRawForm(String raw) { public String getNormalizedForm() { return normalizedForm; } - + public void setNormalizedForm(String normalized) { this.normalizedForm = normalizeRawForm(normalized); } @@ -162,7 +172,7 @@ else if (dataset != null) return dataset.getOffsets(); return null; } - + public int getOffsetStart() { if (datasetName != null) return datasetName.getOffsetStart(); @@ -178,15 +188,16 @@ else if (dataset != null) return dataset.getOffsetEnd(); return -1; } - + public double getConf() { return this.conf; } - + public void setConf(double conf) { this.conf = conf; } - + + /*public List<BoundingBox> getBoundingBoxes() { return boundingBoxes; } @@ -194,15 +205,15 @@ 
public void setConf(double conf) { public void setBoundingBoxes(List<BoundingBox> boundingBoxes) { this.boundingBoxes = boundingBoxes; } - + public List<LayoutToken> getTokens() { return this.tokens; } - + public void setTokens(List<LayoutToken> tokens) { this.tokens = tokens; } - + public TaggingLabel getLabel() { return label; } @@ -210,7 +221,6 @@ public TaggingLabel getLabel() { public void setLabel(TaggingLabel label) { this.label = label; }*/ - public String getLang() { return this.lang; } @@ -225,7 +235,7 @@ public boolean isFiltered() { public void setFiltered(boolean filtered) { this.filtered = filtered; - } + } public void setContext(String context) { this.context = context; @@ -258,7 +268,7 @@ public void setParagraph(String paragraph) { public String getParagraph() { return this.paragraph; } - + public List<BiblioComponent> getBibRefs() { return this.bibRefs; } @@ -290,6 +300,10 @@ public void setDataset(DatasetComponent dataset) { this.dataset = dataset; } + public List<String> getSequenceIdentifiers() { + return sequenceIdentifiers; + } + public DatasetComponent getDataDevice() { return this.dataDevice; } @@ -448,7 +462,7 @@ public String toJson() { buffer.append(", \"inDataAvailabilitySection\" : true"); } - if (context != null && context.length()>0) { + if (StringUtils.isNotBlank(context)) { encoded = encoder.quoteAsUTF8(context.replace("\n", " ").replace(" ", " ")); output = new String(encoded); try { @@ -463,7 +477,7 @@ public String toJson() { }*/ } - if (paragraph != null && paragraph.length()>0) { + if (StringUtils.isNotBlank(paragraph)) { if (paragraphContextOffset != -1) { buffer.append(", \"contextOffset\": " + paragraphContextOffset); } @@ -482,6 +496,22 @@ public String toJson() { }*/ } + if (CollectionUtils.isNotEmpty(sequenceIdentifiers)) { + try{ + String identifiers = Strings.join(sequenceIdentifiers).with(","); + encoded = encoder.quoteAsUTF8(identifiers); + output = new String(encoded); + buffer.append(", \"sequenceIds\": [ " + mapper.writeValueAsString(output) +" ]"); + } catch (JsonProcessingException e) { + logger.warn("could not serialize in JSON the normalized form: " + type.getName()); + } + /*try { + buffer.append(", \"paragraph\": \"" + mapper.writeValueAsString(paragraph.replace("\n", " ").replace(" ", " ")) + "\""); + } catch (JsonProcessingException e) { + logger.warn("could not serialize in JSON the paragraph context: " + paragraph); + }*/ + } + //buffer.append(", \"conf\" : \"" + conf + "\""); /*if ( (boundingBoxes != null) && (boundingBoxes.size() > 0) ) { diff --git a/src/main/java/org/grobid/core/data/DatasetComponent.java b/src/main/java/org/grobid/core/data/DatasetComponent.java index e923799..ea0a7a7 100644 --- a/src/main/java/org/grobid/core/data/DatasetComponent.java +++ b/src/main/java/org/grobid/core/data/DatasetComponent.java @@ -11,6 +11,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.ArrayList; import java.util.List; import org.slf4j.Logger; @@ -61,6 +62,9 @@ public class DatasetComponent extends KnowledgeEntity implements Comparable<Data protected String bestDataType = null; protected double hasDatasetScore = 0.0; + // identifier of the sequence (paragraph, sentence) where the mention is occurring + private final List<String> sequenceIdentifiers = new ArrayList<>(); + public DatasetComponent() { this.offsets = new OffsetPosition(); } @@ -352,5 +356,12 @@ private static String normalizeRawForm(String raw) { result = TextUtilities.cleanField(result, 
false); return result; } - + + public void addSequenceId(String id) { + sequenceIdentifiers.add(id); + } + + public List<String> getSequenceIdentifiers() { + return sequenceIdentifiers; + } } diff --git a/src/main/java/org/grobid/core/data/DatasetDocumentSequence.java b/src/main/java/org/grobid/core/data/DatasetDocumentSequence.java new file mode 100644 index 0000000..6323b9a --- /dev/null +++ b/src/main/java/org/grobid/core/data/DatasetDocumentSequence.java @@ -0,0 +1,101 @@ +package org.grobid.core.data; + +import org.apache.commons.lang3.tuple.Triple; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.utilities.OffsetPosition; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * This class represent a block in the document, which contains a sentence text, + * sentence Layouttoken list and their id + */ + +public class DatasetDocumentSequence { + private String text; + private List<LayoutToken> tokens; + + // The sentence identifier is loaded here, we need them so that each mention can be easily located in the + // document's TEI + private String id; + + private boolean relevantSectionsNamedDatasets = false; + private boolean relevantSectionsImplicitDatasets = false; + + // The references callout are loaded here, so that we can recover the position in the text + // we need target, text value, and position (character related) + Map<String, Triple<OffsetPosition, String, String>> references = new HashMap<>(); + + public DatasetDocumentSequence(List<LayoutToken> layoutTokens) { + this.tokens = layoutTokens; + } + + public DatasetDocumentSequence(DatasetDocumentSequence block) { + this(block.getText(), block.getTokens(), block.getId()); + } + + public DatasetDocumentSequence(String text, List<LayoutToken> tokens, String id) { + this(text, tokens); + this.id = id; + } + + public DatasetDocumentSequence(String text, List<LayoutToken> tokens) { + this.text = text; + this.tokens = tokens; + } + + public DatasetDocumentSequence(String text, String id) { + this.text = text; + this.id = id; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List<LayoutToken> getTokens() { + return tokens; + } + + public void setTokens(List<LayoutToken> tokens) { + this.tokens = tokens; + } + + public boolean isRelevantSectionsNamedDatasets() { + return relevantSectionsNamedDatasets; + } + + public void setRelevantSectionsNamedDatasets(boolean relevantSectionsNamedDatasets) { + this.relevantSectionsNamedDatasets = relevantSectionsNamedDatasets; + } + + public boolean isRelevantSectionsImplicitDatasets() { + return relevantSectionsImplicitDatasets; + } + + public void setRelevantSectionsImplicitDatasets(boolean relevantSectionsImplicitDatasets) { + this.relevantSectionsImplicitDatasets = relevantSectionsImplicitDatasets; + } + + public Map<String, Triple<OffsetPosition, String, String>> getReferences() { + return references; + } + + public void setReferences(Map<String, Triple<OffsetPosition, String, String>> references) { + this.references = references; + } +} diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index cb6de01..0305e80 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -37,7 +37,6 @@ import 
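/*
 * Illustration (not part of the patch): taken together, the data-model changes above let each
 * mention carry the identifier of the sentence or paragraph it was found in, via
 * DatasetDocumentSequence and DatasetComponent.addSequenceId(), and Dataset.toJson() now
 * exposes those identifiers in a "sequenceIds" array. A minimal sketch of the intended flow;
 * the import paths and the location of the DatasetType enum are assumptions.
 */
import org.grobid.core.data.Dataset;
import org.grobid.core.data.DatasetComponent;
import org.grobid.core.data.DatasetDocumentSequence;
import org.grobid.core.data.Dataset.DatasetType; // assumed location of the enum

public class SequenceIdSketch {
    public static void main(String[] args) {
        // a sentence-level sequence with its pre-assigned identifier
        DatasetDocumentSequence sequence =
                new DatasetDocumentSequence("Data were deposited in GenBank.", "_a1b2c3d");
        sequence.setRelevantSectionsNamedDatasets(true);

        // a mention component records the identifier of the sequence it occurs in
        DatasetComponent name = new DatasetComponent("GenBank");
        name.addSequenceId(sequence.getId());

        // the grouped entity inherits the identifiers of its components
        Dataset entity = new Dataset(DatasetType.DATASET_NAME,
                name.getRawForm(), name.getSequenceIdentifiers());
        entity.setDatasetName(name);

        // toJson() should now include something like: "sequenceIds": [ "_a1b2c3d" ]
        System.out.println(entity.toJson());
    }
}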
org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.*; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.utilities.counters.impl.CntManagerFactory; import org.slf4j.Logger; @@ -45,6 +44,7 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import javax.xml.crypto.Data; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; @@ -71,7 +71,6 @@ public class DatasetParser extends AbstractParser { private static volatile DatasetParser instance; - private DatastetLexicon datastetLexicon = null; private EngineParsers parsers; private DatastetConfiguration datastetConfiguration; private DataseerClassifier dataseerClassifier; @@ -100,12 +99,16 @@ private DatasetParser(DatastetConfiguration configuration) { GrobidCRFEngine.valueOf(configuration.getModel("datasets").engine.toUpperCase()), configuration.getModel("datasets").delft.architecture); - datastetLexicon = DatastetLexicon.getInstance(); + DatastetLexicon.getInstance(); parsers = new EngineParsers(); datastetConfiguration = configuration; disambiguator = DatasetDisambiguator.getInstance(configuration); } + public List<List<Dataset>> processing(List<DatasetDocumentSequence> tokensList) { + return processing(tokensList, null, false); + } + /** * Sequence labelling of a list of layout tokens for identifying dataset names. * Input corresponds to a list of sentences, each sentence being itself a list of Layout tokens. @@ -113,7 +116,7 @@ private DatasetParser(DatastetConfiguration configuration) { * @param tokensList the list of LayoutTokens sequences to be labeled * @return list of identified Dataset objects. */ - public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, boolean disambiguate) { + public List<List<Dataset>> processing(List<DatasetDocumentSequence> tokensList, boolean disambiguate) { return processing(tokensList, null, disambiguate); } @@ -121,32 +124,35 @@ public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, boolea * Sequence labelling of a list of layout tokens for identifying dataset names. * Input corresponds to a list of sentences, each sentence being itself a list of Layout tokens. * - * @param tokensList the list of LayoutTokens sequences to be labeled - * @param pdfAnnotations the list of PDF annotation objects (URI, GOTO, GOTOR) to better control - * the recognition + * @param datasetDocumentSequences the list of LayoutTokens sequences to be labeled + * @param pdfAnnotations the list of PDF annotation objects (URI, GOTO, GOTOR) to better control + * the recognition * @return list of identified Dataset objects. 
*/ - public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, List<PDFAnnotation> pdfAnnotations, boolean disambiguate) { + public List<List<Dataset>> processing(List<DatasetDocumentSequence> datasetDocumentSequences, List<PDFAnnotation> pdfAnnotations, boolean disambiguate) { List<List<Dataset>> results = new ArrayList<>(); - if (tokensList == null || tokensList.size() == 0) { + if (CollectionUtils.isEmpty(datasetDocumentSequences)) { return results; } StringBuilder input = new StringBuilder(); //List<String> inputs = new ArrayList<>(); - List<List<LayoutToken>> newTokensList = new ArrayList<>(); + List<DatasetDocumentSequence> newTokensList = new ArrayList<>(); int total = 0; int maxTokens = 0; - for (List<LayoutToken> tokens : tokensList) { + for (DatasetDocumentSequence block : datasetDocumentSequences) { + List<LayoutToken> tokens = block.getTokens(); // to be sure it's done, retokenize according to the DatastetAnalyzer tokens = DatastetAnalyzer.getInstance().retokenizeLayoutTokens(tokens); - newTokensList.add(tokens); + DatasetDocumentSequence newBlock = new DatasetDocumentSequence(block); + newBlock.setTokens(tokens); + newTokensList.add(newBlock); // create basic input without features int nbTokens = 0; for (LayoutToken token : tokens) { - if (token.getText().trim().length() == 0) { + if (StringUtils.isBlank(token.getText())) { //System.out.println("skipped: " + token.getText()); continue; } @@ -166,7 +172,7 @@ public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, List<P //System.out.println("total size: " + total); //System.out.println("max token sequence: " + maxTokens); - tokensList = newTokensList; + datasetDocumentSequences = newTokensList; String allRes = null; try { @@ -177,25 +183,34 @@ public List<List<Dataset>> processing(List<List<LayoutToken>> tokensList, List<P "An exception occured while labeling a sequence.", e); } - if (allRes == null || allRes.length() == 0) + if (StringUtils.isBlank(allRes)) { return results; + } String[] resBlocks = allRes.split("\n\n"); //System.out.println("resBlocks: " + resBlocks.length); int i = 0; - for (List<LayoutToken> tokens : tokensList) { + for (DatasetDocumentSequence datasetDocumentSequence : datasetDocumentSequences) { + List<LayoutToken> tokens = datasetDocumentSequence.getTokens(); if (CollectionUtils.isEmpty(tokens)) { results.add(new ArrayList<>()); } else { String text = LayoutTokensUtil.toText(tokens); List<DatasetComponent> localDatasetcomponents = new ArrayList<>(); - localDatasetcomponents = addUrlComponents(tokens, localDatasetcomponents, text, pdfAnnotations); -/*System.out.println("\n" + text); -for(DatasetComponent localDatasetcomponent : localDatasetcomponents) { -System.out.println(localDatasetcomponent.toJson()); -}*/ + if (pdfAnnotations != null) { + localDatasetcomponents = addUrlComponents(tokens, localDatasetcomponents, text, pdfAnnotations); + } + + /*System.out.println("\n" + text); + for(DatasetComponent localDatasetcomponent : localDatasetcomponents) { + System.out.println(localDatasetcomponent.toJson()); + }*/ List<DatasetComponent> bufferLocalDatasetcomponents = resultExtractionLayoutTokens(resBlocks[i], tokens, text); + bufferLocalDatasetcomponents.stream().forEach(datasetComponent -> { + datasetComponent.addSequenceId(datasetDocumentSequence.getId()); + } + ); List<OffsetPosition> localDatasetcomponentOffsets = new ArrayList<>(); for (DatasetComponent localDatasetcomponent : localDatasetcomponents) { 
localDatasetcomponentOffsets.add(localDatasetcomponent.getOffsets()); @@ -376,14 +391,14 @@ private List<Dataset> groupByEntities(List<DatasetComponent> components, List<La localDataset.setContext(text); localDatasets.add(localDataset); } - localDataset = new Dataset(localComponent.getType(), localComponent.getRawForm()); + localDataset = new Dataset(localComponent.getType(), localComponent.getRawForm(), localComponent.getSequenceIdentifiers()); localDataset.setDatasetName(localComponent); } else if (localComponent.getType() == DatasetType.DATASET) { if (localDataset != null) { localDataset.setContext(text); localDatasets.add(localDataset); } - localDataset = new Dataset(localComponent.getType(), localComponent.getRawForm()); + localDataset = new Dataset(localComponent.getType(), localComponent.getRawForm(), localComponent.getSequenceIdentifiers()); localDataset.setDataset(localComponent); } else if (localComponent.getType() == DatasetType.DATA_DEVICE) { if (localDataset != null && localDataset.getDataset() != null) { @@ -406,6 +421,31 @@ else if (localDataset.getDataDevice().getRawForm().length() < localComponent.get return localDatasets; } + private List<DatasetComponent> addUrlComponents(List<DatasetComponent> existingComponents, + DatasetDocumentSequence sequence) { + + Map<String, Triple<OffsetPosition, String, String>> urls = + sequence.getReferences().entrySet().stream() + .filter(entry -> entry.getValue().getRight().equals(URL_TYPE)) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + if (CollectionUtils.isEmpty(urls.keySet())) { + return existingComponents; + } + + for (Map.Entry<String, Triple<OffsetPosition, String, String>> url : urls.entrySet()) { + String textValue = url.getKey(); + Triple<OffsetPosition, String, String> annotation = url.getValue(); + + OffsetPosition position = annotation.getLeft(); + String target = annotation.getMiddle(); + + } + + return existingComponents; + + } + private List<DatasetComponent> addUrlComponents(List<LayoutToken> sentenceTokens, List<DatasetComponent> existingComponents, String text, @@ -494,10 +534,9 @@ private List<DatasetComponent> addUrlComponents(List<LayoutToken> sentenceTokens return existingComponents; } - private List<DatasetComponent> addUrlComponentsAsReferences(List<LayoutToken> sentenceTokens, - List<DatasetComponent> existingComponents, - String text, - Map<String, Triple<OffsetPosition, String, String>> references) { + private List<DatasetComponent> addUrlComponentsAsReferences(DatasetDocumentSequence sequence, + List<DatasetComponent> existingComponents, + Map<String, Triple<OffsetPosition, String, String>> references) { // positions for lexical match List<OffsetPosition> existingPositions = new ArrayList<>(); @@ -511,7 +550,7 @@ private List<DatasetComponent> addUrlComponentsAsReferences(List<LayoutToken> se String target = urlInfos.getMiddle(); // String type = urlInfos.getRight(); - DatasetComponent urlComponent = new DatasetComponent(text.substring(pos.start, pos.end)); + DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end)); urlComponent.setOffsetStart(pos.start); urlComponent.setOffsetEnd(pos.end); if (target != null) { @@ -541,14 +580,15 @@ private List<DatasetComponent> addUrlComponentsAsReferences(List<LayoutToken> se * Sequence labelling of a string for identifying dataset names. 
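/*
 * Illustration (not part of the patch): references extracted from the TEI are kept in
 * DatasetDocumentSequence.references as a map from the callout text to a Triple of
 * (position in the sequence text, link target, reference type), and URL-typed entries are
 * turned into DatasetComponents anchored on the same character span, as in
 * addUrlComponentsAsReferences above. A sketch of such an entry; the "url" type literal
 * and the example values are assumptions.
 */
import org.apache.commons.lang3.tuple.Triple;
import org.grobid.core.data.DatasetComponent;
import org.grobid.core.data.DatasetDocumentSequence;
import org.grobid.core.utilities.OffsetPosition;

public class ReferenceSketch {
    public static void main(String[] args) {
        String sentence = "The data are available at https://zenodo.org/record/123 under CC-BY.";
        DatasetDocumentSequence sequence = new DatasetDocumentSequence(sentence, "_b2c3d4e");

        // callout text -> (offsets in the sentence, link target, reference type)
        OffsetPosition position = new OffsetPosition(26, 55);
        sequence.getReferences().put("https://zenodo.org/record/123",
                Triple.of(position, "https://zenodo.org/record/123", "url"));

        // a URL reference becomes a DatasetComponent covering the same span
        DatasetComponent urlComponent =
                new DatasetComponent(sentence.substring(position.start, position.end));
        urlComponent.setOffsetStart(position.start);
        urlComponent.setOffsetEnd(position.end);
        System.out.println(urlComponent.getRawForm());
    }
}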
*/ public List<Dataset> processingString(String input, boolean disambiguate) { - List<List<LayoutToken>> tokensList = new ArrayList<>(); + List<DatasetDocumentSequence> tokensList = new ArrayList<>(); input = UnicodeUtil.normaliseText(input); - tokensList.add(analyzer.tokenizeWithLayoutToken(input)); + tokensList.add(new DatasetDocumentSequence(analyzer.tokenizeWithLayoutToken(input))); List<List<Dataset>> result = processing(tokensList, disambiguate); - if (result != null && result.size() > 0) + if (CollectionUtils.isNotEmpty(result)) { return result.get(0); - else - return new ArrayList<Dataset>(); + } else { + return new ArrayList<>(); + } } private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSentences) { @@ -625,10 +665,12 @@ private List<DataseerResults> classifyWithDataseerClassifier(List<String> allSen } public List<List<Dataset>> processingStrings(List<String> inputs, boolean disambiguate) { - List<List<LayoutToken>> tokensList = new ArrayList<>(); + List<DatasetDocumentSequence> tokensList = new ArrayList<>(); for (String input : inputs) { input = UnicodeUtil.normaliseText(input); - tokensList.add(analyzer.tokenizeWithLayoutToken(input)); + List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(input, tokens); + tokensList.add(datasetDocumentSequence); } return processing(tokensList, disambiguate); } @@ -663,9 +705,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, // segment of interest (e.g. header, body, annex) and possibly apply // the corresponding model to further filter by structure types - List<List<LayoutToken>> selectedLayoutTokenSequences = new ArrayList<>(); - List<Boolean> relevantSectionsNamedDatasets = new ArrayList<>(); - List<Boolean> relevantSectionsImplicitDatasets = new ArrayList<>(); + List<DatasetDocumentSequence> selectedDatasetDocumentSequences = new ArrayList<>(); // the following array stores the index of the sections identified as Data availability statement //List<Integer> sectionsDAS = new ArrayList<>(); @@ -692,25 +732,28 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, // title List<LayoutToken> titleTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_TITLE); if (titleTokens != null) { - selectedLayoutTokenSequences.add(titleTokens); - relevantSectionsNamedDatasets.add(false); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(titleTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(false); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } // abstract List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT); if (abstractTokens != null) { - selectedLayoutTokenSequences.add(abstractTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(abstractTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } // keywords List<LayoutToken> keywordTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_KEYWORD); if (keywordTokens != null) { - selectedLayoutTokenSequences.add(keywordTokens); - 
relevantSectionsNamedDatasets.add(false); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(keywordTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(false); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } } } @@ -729,7 +772,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, LayoutTokenization tokenizationBody = featSeg.getRight(); String rese = null; - if ((bodytext != null) && (bodytext.trim().length() > 0)) { + if (StringUtils.isNotBlank(bodytext)) { rese = parsers.getFullTextParser().label(bodytext); } else { LOGGER.debug("Fulltext model: The input to the sequence labelling processing is empty"); @@ -756,13 +799,16 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, if (curParagraphTokens == null) curParagraphTokens = new ArrayList<>(); //curParagraphTokens.addAll(localTokenization); - } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH) || clusterLabel.equals(TaggingLabels.ITEM)) { + } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH) + || clusterLabel.equals(TaggingLabels.ITEM)) { //|| clusterLabel.equals(TaggingLabels.SECTION) { - if (lastClusterLabel == null || curParagraphTokens == null || isNewParagraph(lastClusterLabel)) { + if (lastClusterLabel == null || curParagraphTokens == null + || isNewParagraph(lastClusterLabel)) { if (curParagraphTokens != null) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(true); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } curParagraphTokens = new ArrayList<>(); } @@ -781,9 +827,10 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, } // last paragraph if (curParagraphTokens != null) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(true); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } } } @@ -804,7 +851,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, LayoutTokenization tokenizationBody = featSeg.getRight(); String rese = null; - if ((bodytext != null) && (bodytext.trim().length() > 0)) { + if (StringUtils.isNotBlank(bodytext)) { rese = parsers.getFullTextParser().label(bodytext); } else { LOGGER.debug("Fulltext model applied to Annex: The input to the sequence labelling processing is empty"); @@ -826,7 +873,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, String clusterText = LayoutTokensUtil.toText(cluster.concatTokens()); List<LayoutToken> localTokenization = cluster.concatTokens(); - if ((localTokenization == null) || (localTokenization.size() == 0)) + if (CollectionUtils.isNotEmpty(localTokenization)) continue; if (TEIFormatter.MARKER_LABELS.contains(clusterLabel)) { @@ -839,13 +886,15 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, } else if 
(clusterLabel.equals(TaggingLabels.PARAGRAPH) || clusterLabel.equals(TaggingLabels.ITEM)) { if (lastClusterLabel == null || curParagraphTokens == null || isNewParagraph(lastClusterLabel)) { if (curParagraphTokens != null && previousSection == null) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } else if (curParagraphTokens != null && previousSection.equals("das")) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(true); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } curParagraphTokens = new ArrayList<>(); } @@ -870,13 +919,15 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, } // last paragraph if (curParagraphTokens != null && currentSection == null) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } else if (curParagraphTokens != null && currentSection.equals("das")) { - selectedLayoutTokenSequences.add(curParagraphTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(curParagraphTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(true); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } } } @@ -886,9 +937,10 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, if (documentParts != null) { List<LayoutToken> footnoteTokens = doc.getTokenizationParts(documentParts, doc.getTokenizations()); if (footnoteTokens != null) { - selectedLayoutTokenSequences.add(footnoteTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(footnoteTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(false); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } } @@ -969,8 +1021,8 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, // consolidate the attached ref bib (we don't consolidate all bibliographical references // to avoid useless costly computation) - List<BibDataSet> citationsToConsolidate = new ArrayList<BibDataSet>(); - List<Integer> consolidated = new ArrayList<Integer>(); + List<BibDataSet> citationsToConsolidate = new ArrayList<>(); + List<Integer> consolidated = new ArrayList<>(); for 
(List<Dataset> datasets : entities) { for (Dataset entity : datasets) { if (entity.getBibRefs() != null && entity.getBibRefs().size() > 0) { @@ -1032,28 +1084,36 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, if (CollectionUtils.isNotEmpty(availabilityTokens)) { entities = markDAS(entities, availabilityTokens); } - selectedLayoutTokenSequences.add(availabilityTokens); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + DatasetDocumentSequence datasetDocumentSequence = new DatasetDocumentSequence(availabilityTokens); + datasetDocumentSequence.setRelevantSectionsNamedDatasets(true); + datasetDocumentSequence.setRelevantSectionsImplicitDatasets(true); + selectedDatasetDocumentSequences.add(datasetDocumentSequence); } } // segment zone into sentences - List<List<LayoutToken>> allLayoutTokens = new ArrayList<>(); + List<DatasetDocumentSequence> allDatasetDocumentSequences = new ArrayList<>(); List<String> allSentences = new ArrayList<>(); List<Integer> sentenceOffsetStarts = new ArrayList<>(); int zoneIndex = 0; int accumulatedOffset = 0; Map<Integer, Integer> mapSentencesToZones = new HashMap<>(); - for (List<LayoutToken> layoutTokens : selectedLayoutTokenSequences) { + for (DatasetDocumentSequence sequence : selectedDatasetDocumentSequences) { + List<LayoutToken> layoutTokens = sequence.getTokens(); + + // To be sure we should add the sequence identifiers + + String sequenceId = "_" + KeyGen.getKey().substring(0, 7); + sequence.setId(sequenceId); + layoutTokens = DatastetAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens); - if ((layoutTokens == null) || (layoutTokens.size() == 0)) { + if (CollectionUtils.isEmpty(layoutTokens)) { //allLayoutTokens.add(null); //allSentences.add(null); List<LayoutToken> dummyLayoutTokens = new ArrayList<>(); dummyLayoutTokens.add(new LayoutToken("dummy")); - allLayoutTokens.add(dummyLayoutTokens); + allDatasetDocumentSequences.add(new DatasetDocumentSequence(dummyLayoutTokens)); //System.out.println("dummy sentence at " + (allSentences.size())); allSentences.add("dummy"); sentenceOffsetStarts.add(accumulatedOffset); @@ -1087,7 +1147,9 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, pos += token.getText().length(); } - allLayoutTokens.add(sentenceTokens); + // We need to generate IDs for each sentence + sequenceId = "_" + KeyGen.getKey().substring(0, 7); + allDatasetDocumentSequences.add(new DatasetDocumentSequence(localText.substring(startPos, endPos), sentenceTokens, sequenceId)); allSentences.add(localText.substring(startPos, endPos)); mapSentencesToZones.put(allSentences.size() - 1, zoneIndex); sentenceOffsetStarts.add(accumulatedOffset + startPos); @@ -1100,7 +1162,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, //System.out.println("sentenceOffsetStarts size: " + sentenceOffsetStarts.size()); // pre-process labeling of every sentences in batch - processLayoutTokenSequences(allLayoutTokens, entities, sentenceOffsetStarts, pdfAnnotations, disambiguate); + processLayoutTokenSequences(allDatasetDocumentSequences, entities, sentenceOffsetStarts, pdfAnnotations, disambiguate); //System.out.println("entities size: " + entities.size()); //System.out.println("mapSentencesToZones size: " + mapSentencesToZones.size()); @@ -1158,8 +1220,8 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, int index = 0; List<List<Dataset>> newEntities = new ArrayList<>(); - for (List<LayoutToken> sentenceTokens : allLayoutTokens) { - List<Dataset> 
localEntities = propagateLayoutTokenSequence(sentenceTokens, + for (DatasetDocumentSequence sequence : allDatasetDocumentSequences) { + List<Dataset> localEntities = propagateLayoutTokenSequence(sequence, entities.get(index), termProfiles, termPattern, @@ -1170,7 +1232,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, Collections.sort(localEntities); // revisit and attach URL component - localEntities = attachUrlComponents(localEntities, sentenceTokens, allSentences.get(index), pdfAnnotations); + localEntities = attachUrlComponents(localEntities, sequence.getTokens(), allSentences.get(index), pdfAnnotations); } newEntities.add(localEntities); @@ -1213,12 +1275,12 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, } if (localDataset.getType() == DatasetType.DATASET && - !relevantSectionsImplicitDatasets.get(currentZone) && !referenceDataSource) { + !selectedDatasetDocumentSequences.get(currentZone).isRelevantSectionsImplicitDatasets() && !referenceDataSource) { continue; } if (localDataset.getType() == DatasetType.DATASET_NAME && - !relevantSectionsNamedDatasets.get(currentZone)) { + !selectedDatasetDocumentSequences.get(currentZone).isRelevantSectionsNamedDatasets()) { continue; } @@ -1522,12 +1584,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do boolean disambiguate, boolean addParagraphContext) { - List<String> selectedSequences = new ArrayList<>(); - //The references callout are loaded here, so that we can recover the position in the text - // we need target, text value, and position (character related) - List<Map<String, Triple<OffsetPosition, String, String>>> selectedSequencesReferences = new ArrayList<>(); - List<Boolean> relevantSectionsNamedDatasets = new ArrayList<>(); - List<Boolean> relevantSectionsImplicitDatasets = new ArrayList<>(); + List<DatasetDocumentSequence> selectedSequences = new ArrayList<>(); //Extract relevant section from the TEI // Title, abstract, keywords @@ -1542,12 +1599,13 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do doc, XPathConstants.NODE); if (titleNode == null) { - LOGGER.warn("Title was not found, skipping."); + LOGGER.warn("The title was not found in the TEI, skipping."); } else { String textTitle = titleNode.getTextContent(); - selectedSequences.add(textTitle); - relevantSectionsNamedDatasets.add(false); - relevantSectionsImplicitDatasets.add(false); + String titleId = ((org.w3c.dom.Element) titleNode).getAttribute("xml:id"); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(textTitle, titleId); + localSequence.setRelevantSectionsNamedDatasets(false); + localSequence.setRelevantSectionsImplicitDatasets(false); } } catch (XPathExpressionException e) { @@ -1564,13 +1622,17 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < abstractNodeList.getLength(); i++) { org.w3c.dom.Node item = abstractNodeList.item(i); String text = item.getTextContent(); - selectedSequences.add(text); // Capture URLs if available - //LF Not clear why true, just copied from around ProcessPDF:578 - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + + //LF: Not clear why true, just copied from around ProcessPDF:578 + localSequence.setRelevantSectionsNamedDatasets(true); + 
localSequence.setRelevantSectionsImplicitDatasets(false); + selectedSequences.add(localSequence); + } } catch (XPathExpressionException e) { @@ -1586,10 +1648,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < keywordsNodeList.getLength(); i++) { org.w3c.dom.Node item = keywordsNodeList.item(i); + String keyword = item.getTextContent(); - selectedSequences.add(keyword); - relevantSectionsNamedDatasets.add(false); - relevantSectionsImplicitDatasets.add(false); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(keyword, itemId); + + //LF: Not clear why true, just copied from around ProcessPDF:578 + localSequence.setRelevantSectionsNamedDatasets(false); + localSequence.setRelevantSectionsImplicitDatasets(false); + selectedSequences.add(localSequence); } } catch (XPathExpressionException e) { @@ -1599,10 +1667,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Fill up the references to match the current sentence/paragraphs - for (String seq : selectedSequences) { - selectedSequencesReferences.add(new HashMap<>()); - } - // Extraction from Body try { String expression = segmentSentences ? "//text/body/div/p" : "//text/body/div/p/s"; @@ -1613,16 +1677,19 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < bodyNodeList.getLength(); i++) { org.w3c.dom.Node item = bodyNodeList.item(i); String text = item.getTextContent(); - selectedSequences.add(text); - // Capture URLs if available + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); //LF Not clear why true, just copied from around ProcessPDF:635 - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(true); + selectedSequences.add(localSequence); + + // Capture URLs if available Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); - selectedSequencesReferences.add(referencesInText); + localSequence.setReferences(referencesInText); } } catch (XPathExpressionException e) { @@ -1632,29 +1699,29 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Various statements (acknowledgement, funding, data availability) - // funding and acknowledgement at the moment have only paragraphs (Grobid issue # - List<String> sectionTypesOnlyParagraphs = Arrays.asList("acknowledgement", "funding"); - - for (String sectionType : sectionTypesOnlyParagraphs) { - try { - String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; - org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, - doc, - XPathConstants.NODESET); - for (int i = 0; i < annexNodeList.getLength(); i++) { - org.w3c.dom.Node item = annexNodeList.item(i); - String text = item.getTextContent(); - selectedSequences.add(text); - selectedSequencesReferences.add(new HashMap<>()); - relevantSectionsNamedDatasets.add(false); - relevantSectionsImplicitDatasets.add(false); - } - - } catch 
(XPathExpressionException e) { - // Ignore exception - LOGGER.warn("Abstract was not found, skipping."); - } - } +// // funding and acknowledgement at the moment have only paragraphs (Grobid issue # +// List<String> sectionTypesOnlyParagraphs = Arrays.asList("acknowledgement", "funding"); +// +// for (String sectionType : sectionTypesOnlyParagraphs) { +// try { +// String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; +// org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, +// doc, +// XPathConstants.NODESET); +// for (int i = 0; i < annexNodeList.getLength(); i++) { +// org.w3c.dom.Node item = annexNodeList.item(i); +// String text = item.getTextContent(); +// selectedSequences.add(text); +// selectedSequencesReferences.add(new HashMap<>()); +// relevantSectionsNamedDatasets.add(false); +// relevantSectionsImplicitDatasets.add(false); +// } +// +// } catch (XPathExpressionException e) { +// // Ignore exception +// LOGGER.warn("Abstract was not found, skipping."); +// } +// } // Annex might contain misclassified relevant sections try { @@ -1683,17 +1750,19 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate("//*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); for (int j = 0; j < textsAnnex.getLength(); j++) { org.w3c.dom.Node paragraphAnnex = textsAnnex.item(j); - String paragraph = paragraphAnnex.getTextContent(); - selectedSequences.add(paragraph); - selectedSequencesReferences.add(new HashMap<>()); + String text = paragraphAnnex.getTextContent(); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + + selectedSequences.add(localSequence); if (StringUtils.equals(currentSection, "das")) { - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(true); } else { - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(false); } } } @@ -1703,13 +1772,13 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do LOGGER.warn("Annex was not found, skipping."); } - // availability statement have sentences + // specific section types statement DatastetAnalyzer datastetAnalyzer = DatastetAnalyzer.getInstance(); - List<String> sectionTypesAlsoSentences = Arrays.asList("availability"); + List<String> specificSectionTypesAnnex = Arrays.asList("availability", "acknowledgement", "funding"); - List<LayoutToken> availabilityTokens = new ArrayList<>(); - for (String sectionType : sectionTypesAlsoSentences) { + List<DatasetDocumentSequence> availabilitySequences = new ArrayList<>(); + for (String sectionType : specificSectionTypesAnnex) { try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; expression = segmentSentences ? 
expression + "/*[local-name() = 's']" : ""; @@ -1719,11 +1788,13 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < annexNodeList.getLength(); i++) { org.w3c.dom.Node item = annexNodeList.item(i); String text = item.getTextContent(); - selectedSequences.add(text); - selectedSequencesReferences.add(new HashMap<>()); - availabilityTokens.addAll(analyzer.tokenizeWithLayoutToken(text)); - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(true); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, analyzer.tokenizeWithLayoutToken(text), itemId); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(true); + selectedSequences.add(localSequence); + availabilitySequences.add(localSequence); } } catch (XPathExpressionException e) { @@ -1741,12 +1812,17 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do XPathConstants.NODESET); for (int i = 0; i < bodyNodeList.getLength(); i++) { org.w3c.dom.Node item = bodyNodeList.item(i); + String text = item.getTextContent(); - selectedSequences.add(text); - selectedSequencesReferences.add(new HashMap<>()); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + //LF Not clear why true, just copied from around ProcessPDF:635 - relevantSectionsNamedDatasets.add(true); - relevantSectionsImplicitDatasets.add(false); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(false); + selectedSequences.add(localSequence); + availabilitySequences.add(localSequence); } } catch (XPathExpressionException e) { @@ -1785,12 +1861,13 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do List<BiblioComponent> bibRefComponents = new ArrayList<>(); Map<String, BiblioItem> biblioRefMap = new HashMap<>(); - List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequencesReferences.stream() + List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream() + .map(DatasetDocumentSequence::getReferences) .filter(map -> map.values().stream() .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) .toList(); - for(Map<String, Triple<OffsetPosition, String, String>> ref :referencesList) { + for (Map<String, Triple<OffsetPosition, String, String>> ref : referencesList) { for (String refText : ref.keySet()) { Triple<OffsetPosition, String, String> infos = ref.get(refText); @@ -1813,19 +1890,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } } - //Dataset Recognition + // Dataset Recognition List<List<Dataset>> entities = new ArrayList<>(); - //TODO: Add sentence segmentation. 
if sentenceSegmentation is false, we need to perform it now - - List<List<LayoutToken>> selectedSequencesLayoutTokens = new ArrayList<>(); List<LayoutToken> allDocumentTokens = new ArrayList<>(); int startingOffset = 0; List<Integer> sentenceOffsetStarts = new ArrayList<>(); - for (String sequence : selectedSequences) { - List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence); - selectedSequencesLayoutTokens.add(sentenceTokens); + for (DatasetDocumentSequence sequence : selectedSequences) { + List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence.getText()); + sequence.setTokens(sentenceTokens); int finalStartingOffset = startingOffset; List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream() .map(lt -> { @@ -1836,10 +1910,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do allDocumentTokens.addAll(sentenceTokenAllTokens); sentenceOffsetStarts.add(startingOffset); - startingOffset += sequence.length(); + startingOffset += sequence.getText().length(); } - List<List<Dataset>> datasetLists = processing(selectedSequencesLayoutTokens, new ArrayList<>(), false); + List<List<Dataset>> datasetLists = processing(selectedSequences, false); entities.addAll(datasetLists); @@ -1856,7 +1930,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } // TODO make sure that selectedSequences == allSentences above in the processPDF? - List<DataseerResults> dataseerClassificationResults = classifyWithDataseerClassifier(selectedSequences); + List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList(); + List<DataseerResults> dataseerClassificationResults = classifyWithDataseerClassifier(allSentences); for (int i = 0; i < entities.size(); i++) { List<Dataset> localDatasets = entities.get(i); @@ -1888,20 +1963,23 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do List<List<OffsetPosition>> placeTaken = preparePlaceTaken(entities); List<List<Dataset>> newEntities = new ArrayList<>(); - for (int i = 0; i < selectedSequencesReferences.size(); i++) { - List<LayoutToken> sentenceTokens = selectedSequencesLayoutTokens.get(i); - List<Dataset> localEntities = propagateLayoutTokenSequence(sentenceTokens, + for (int i = 0; i < selectedSequences.size(); i++) { + + DatasetDocumentSequence selectedSequence = selectedSequences.get(i); + List<Dataset> localEntities = propagateLayoutTokenSequence( + selectedSequence, entities.get(i), termProfiles, termPattern, placeTaken.get(i), frequencies, - sentenceOffsetStarts.get(i)); + sentenceOffsetStarts.get(i) + ); if (localEntities != null) { Collections.sort(localEntities); // revisit and attach URL component - localEntities = attachUrlComponents(localEntities, sentenceTokens, selectedSequences.get(i), selectedSequencesReferences.get(i)); + localEntities = attachUrlComponents(localEntities, selectedSequence); } newEntities.add(localEntities); @@ -1922,12 +2000,12 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } if (localDataset.getType() == DatasetType.DATASET && - !relevantSectionsImplicitDatasets.get(i) && !referenceDataSource) { + !selectedSequences.get(i).isRelevantSectionsImplicitDatasets() && !referenceDataSource) { continue; } if (localDataset.getType() == DatasetType.DATASET_NAME && - !relevantSectionsNamedDatasets.get(i)) { + !selectedSequences.get(i).isRelevantSectionsNamedDatasets()) { continue; } @@ -2015,7 +2093,8 @@ public 
Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } // mark datasets present in Data Availability section(s) - if (CollectionUtils.isNotEmpty(availabilityTokens)) { + if (CollectionUtils.isNotEmpty(availabilitySequences)) { + List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList(); entities = markDAS(entities, availabilityTokens); } @@ -2044,7 +2123,7 @@ public static String getXPathWithoutNamespaces(String s) { /** * Process with the dataset model a set of arbitrary sequence of LayoutTokenization */ - private List<List<Dataset>> processLayoutTokenSequences(List<List<LayoutToken>> layoutTokenList, + private List<List<Dataset>> processLayoutTokenSequences(List<DatasetDocumentSequence> layoutTokenList, List<List<Dataset>> entities, List<Integer> sentenceOffsetStarts, List<PDFAnnotation> pdfAnnotations, @@ -2296,7 +2375,7 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) { FastMatcher termPattern = new FastMatcher(); List<String> added = new ArrayList<>(); for (List<Dataset> datasets : entities) { - if (CollectionUtils.isEmpty(datasets)){ + if (CollectionUtils.isEmpty(datasets)) { continue; } @@ -2367,7 +2446,7 @@ public FastMatcher prepareTermPattern(List<List<Dataset>> entities) { public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, List<LayoutToken> tokens) { Map<String, Integer> frequencies = new TreeMap<String, Integer>(); for (List<Dataset> datasets : entities) { - if (CollectionUtils.isEmpty(datasets)){ + if (CollectionUtils.isEmpty(datasets)) { continue; } for (Dataset entity : datasets) { @@ -2391,7 +2470,7 @@ public Map<String, Integer> prepareFrequencies(List<List<Dataset>> entities, Lis return frequencies; } - public List<Dataset> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens, + public List<Dataset> propagateLayoutTokenSequence(DatasetDocumentSequence sequence, List<Dataset> entities, Map<String, Double> termProfiles, FastMatcher termPattern, @@ -2399,6 +2478,7 @@ public List<Dataset> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens Map<String, Integer> frequencies, int sentenceOffsetStart) { + List<LayoutToken> layoutTokens = sequence.getTokens(); List<OffsetPosition> results = termPattern.matchLayoutToken(layoutTokens, true, true); // above: do not ignore delimiters and case sensitive matching @@ -2460,6 +2540,7 @@ public List<Dataset> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens name.setLabel(DatasetTaggingLabels.DATASET_NAME); name.setType(DatasetType.DATASET_NAME); name.setTokens(matchedTokens); + name.addSequenceId(sequence.getId()); List<BoundingBox> boundingBoxes = BoundingBoxCalculator.calculate(matchedTokens); name.setBoundingBoxes(boundingBoxes); @@ -2467,6 +2548,7 @@ public List<Dataset> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens Dataset entity = new Dataset(DatasetType.DATASET_NAME, name.getRawForm()); entity.setDatasetName(name); entity.setContext(localText); + entity.getSequenceIdentifiers().addAll(name.getSequenceIdentifiers()); //entity.setType(DatastetLexicon.Dataset_Type.DATASET); entity.setPropagated(true); entity.setGlobalContextOffset(sentenceOffsetStart); @@ -2670,17 +2752,14 @@ public List<Dataset> attachUrlComponents(List<Dataset> datasets, return datasets; } - public List<Dataset> attachUrlComponents(List<Dataset> datasets, - List<LayoutToken> tokens, - String sentenceString, - Map<String, Triple<OffsetPosition, String, String>> references) { + public List<Dataset> 
attachUrlComponents(List<Dataset> datasets, DatasetDocumentSequence sequence) { // revisit url including propagated dataset names - if (datasets == null || datasets.size() == 0) { + if (CollectionUtils.isEmpty(datasets)) { return datasets; } // Filter references only of type URLs - Map<String, Triple<OffsetPosition, String, String>> onlyURLs = references.entrySet().stream() + Map<String, Triple<OffsetPosition, String, String>> onlyURLs = sequence.getReferences().entrySet().stream() .filter(entry -> { Triple<OffsetPosition, String, String> triple = entry.getValue(); return triple.getRight().equals(URL_TYPE); @@ -2721,7 +2800,7 @@ public List<Dataset> attachUrlComponents(List<Dataset> datasets, Collections.sort(localDatasetcomponents); int sizeBefore = localDatasetcomponents.size(); - localDatasetcomponents = addUrlComponentsAsReferences(tokens, localDatasetcomponents, sentenceString, references); + localDatasetcomponents = addUrlComponentsAsReferences(sequence, localDatasetcomponents, onlyURLs); // attach URL to the closest dataset while (localDatasetcomponents.size() - sizeBefore > 0) { diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 19a46fe..9532bcf 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -147,6 +147,9 @@ public static String getTextNoRefMarkers(Element element) { return found ? buf.toString() : null; } + /** + * @return Pair with text or null on the left and a Triple with (position, target and type) + */ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> getTextNoRefMarkersAndMarkerPositions(Element element, int globalPos) { StringBuffer buf = new StringBuffer(); NodeList nodeChildren = element.getChildNodes(); From cc1cd2a54c894dc1985e5a897e23a5289364360d Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 21 May 2024 15:33:45 +0900 Subject: [PATCH 30/46] Fix sentence switch --- src/main/java/org/grobid/core/engines/DatasetParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 0305e80..d9907ec 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1590,7 +1590,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Title, abstract, keywords // If we process the TEI, at this point the document should be already segmented correctly. - boolean segmentSentences = true; + boolean segmentSentences = false; XPath xPath = XPathFactory.newInstance().newXPath(); From c58502ef5047704e29419ca92de900b38e5ee583 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Fri, 24 May 2024 07:24:30 +0900 Subject: [PATCH 31/46] Fix incorrect xpath on children --- src/main/java/org/grobid/core/engines/DatasetParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index d9907ec..2a25bc9 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1747,7 +1747,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } } String granularity = segmentSentences ? 
"p" : "s"; - org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate("//*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); + org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate("./*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); for (int j = 0; j < textsAnnex.getLength(); j++) { org.w3c.dom.Node paragraphAnnex = textsAnnex.item(j); From 6977bdabc8cc4015c407e429984fe90aa2392a91 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 4 Jun 2024 16:48:33 +1200 Subject: [PATCH 32/46] Cleanup text when extracting from XML, normalise unicode character, remove duplicate spaces --- .../grobid/core/engines/DatasetParser.java | 79 +++++++++++++++---- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 2a25bc9..0b96908 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1601,9 +1601,11 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do if (titleNode == null) { LOGGER.warn("The title was not found in the TEI, skipping."); } else { - String textTitle = titleNode.getTextContent(); + String text = titleNode.getTextContent(); + String normalizedText = normalize(text); + String titleId = ((org.w3c.dom.Element) titleNode).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(textTitle, titleId); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId); localSequence.setRelevantSectionsNamedDatasets(false); localSequence.setRelevantSectionsImplicitDatasets(false); } @@ -1622,11 +1624,11 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < abstractNodeList.getLength(); i++) { org.w3c.dom.Node item = abstractNodeList.item(i); String text = item.getTextContent(); + String normalizedText = normalize(text); // Capture URLs if available - String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, itemId); //LF: Not clear why true, just copied from around ProcessPDF:578 localSequence.setRelevantSectionsNamedDatasets(true); @@ -1651,8 +1653,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do String keyword = item.getTextContent(); String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(keyword, itemId); + String normalizedKeyword = normalize(keyword); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedKeyword, itemId); //LF: Not clear why true, just copied from around ProcessPDF:578 localSequence.setRelevantSectionsNamedDatasets(false); @@ -1677,9 +1679,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < bodyNodeList.getLength(); i++) { org.w3c.dom.Node item = bodyNodeList.item(i); String text = item.getTextContent(); + String normalizedText = normalize(text); String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + DatasetDocumentSequence localSequence = new 
DatasetDocumentSequence(normalizedText, itemId); //LF Not clear why true, just copied from around ProcessPDF:635 localSequence.setRelevantSectionsNamedDatasets(true); @@ -1736,11 +1739,12 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do String currentSection = null; org.w3c.dom.Node head = (org.w3c.dom.Node) xPath.evaluate("./*[local-name() = 'head']", item, XPathConstants.NODE); if (head != null) { - String headText = head.getTextContent(); + String text = head.getTextContent(); + String normalizedText = normalize(text); - if (checkDASAnnex(headText)) { + if (checkDASAnnex(normalizedText)) { currentSection = "das"; - } else if (checkAuthorAnnex(headText) || checkAbbreviationAnnex(headText)) { + } else if (checkAuthorAnnex(normalizedText) || checkAbbreviationAnnex(normalizedText)) { currentSection = "author"; } else { currentSection = null; @@ -1752,8 +1756,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do org.w3c.dom.Node paragraphAnnex = textsAnnex.item(j); String text = paragraphAnnex.getTextContent(); + String normalizedText = normalize(text); String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, itemId); selectedSequences.add(localSequence); @@ -1788,9 +1793,11 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (int i = 0; i < annexNodeList.getLength(); i++) { org.w3c.dom.Node item = annexNodeList.item(i); String text = item.getTextContent(); + String normalizedText = normalize(text); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, analyzer.tokenizeWithLayoutToken(text), itemId); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, analyzer.tokenizeWithLayoutToken(text), itemId); localSequence.setRelevantSectionsNamedDatasets(true); localSequence.setRelevantSectionsImplicitDatasets(true); selectedSequences.add(localSequence); @@ -1799,7 +1806,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } catch (XPathExpressionException e) { // Ignore exception - LOGGER.warn("Availability statement was not found, skipping."); + LOGGER.warn(sectionType + " statement was not found, skipping."); } } @@ -1814,9 +1821,11 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do org.w3c.dom.Node item = bodyNodeList.item(i); String text = item.getTextContent(); + String normalizedText = normalize(text); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); - DatasetDocumentSequence localSequence = new DatasetDocumentSequence(text, itemId); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, itemId); //LF Not clear why true, just copied from around ProcessPDF:635 localSequence.setRelevantSectionsNamedDatasets(true); @@ -1845,7 +1854,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do org.w3c.dom.Node attribute = item.getAttributes().item(a); if (attribute.getNodeName().equals("xml:id")) { String referenceText = item.getTextContent(); - String cleanedRawReferenceText = referenceText.replaceAll("\\s", " ").strip().replaceAll("[ ]{2,}", ", "); + String normalizedReferenceText = normalize(referenceText); + String cleanedRawReferenceText = 
normalizedReferenceText.replaceAll("\\p{Space}+", " ").strip().replaceAll("[ ]{2,}", ", "); referenceMap.put(attribute.getNodeValue(), Pair.of(cleanedRawReferenceText, item)); } } @@ -1900,6 +1910,30 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (DatasetDocumentSequence sequence : selectedSequences) { List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence.getText()); sequence.setTokens(sentenceTokens); +// +// // Normalization +// List<LayoutToken> sentenceTokensNormalized = sentenceTokens.stream() +// .map(layoutToken -> { +// layoutToken.setText(UnicodeUtil.normaliseText(layoutToken.getText())); +// +// return layoutToken; +// } +// ).toList(); +// // Adjust offsets +// +// // Correcting offsets after having removed certain tokens +// IntStream +// .range(1, sentenceTokensNormalized.size()) +// .forEach(i -> { +// int expectedFollowingOffset = sentenceTokensNormalized.get(i - 1).getOffset() +// + StringUtils.length(sentenceTokensNormalized.get(i - 1).getText()); +// +// if (expectedFollowingOffset != sentenceTokensNormalized.get(i).getOffset()) { +// LOGGER.trace("Correcting offsets " + i + " from " + sentenceTokensNormalized.get(i).getOffset() + " to " + expectedFollowingOffset); +// sentenceTokensNormalized.get(i).setOffset(expectedFollowingOffset); +// } +// }); +// int finalStartingOffset = startingOffset; List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream() .map(lt -> { @@ -2106,6 +2140,17 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do return Pair.of(entities, resCitations); } + private static String normalize(String text) { + String normalizedText = UnicodeUtil.normaliseText(text); + normalizedText = normalizedText.replace("\n", " "); + normalizedText = normalizedText.replace("\t", " "); + normalizedText = normalizedText.replace(" ", " "); + // the last one is a special "large" space missed by the regex "\\p{Space}+" below + normalizedText = normalizedText.replaceAll("\\p{Space}+", " "); + + return normalizedText; + } + public static String getXPathWithoutNamespaces(String s) { StringBuilder sb = new StringBuilder(); for (String item : s.split("/")) { @@ -2123,12 +2168,12 @@ public static String getXPathWithoutNamespaces(String s) { /** * Process with the dataset model a set of arbitrary sequence of LayoutTokenization */ - private List<List<Dataset>> processLayoutTokenSequences(List<DatasetDocumentSequence> layoutTokenList, + private List<List<Dataset>> processLayoutTokenSequences(List<DatasetDocumentSequence> documentSequenceList, List<List<Dataset>> entities, List<Integer> sentenceOffsetStarts, List<PDFAnnotation> pdfAnnotations, boolean disambiguate) { - List<List<Dataset>> results = processing(layoutTokenList, pdfAnnotations, disambiguate); + List<List<Dataset>> results = processing(documentSequenceList, pdfAnnotations, disambiguate); entities.addAll(results); int i = 0; From cc0114022d1bc7efee3406cfbf811f1b03e07569 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 4 Jun 2024 17:10:10 +1200 Subject: [PATCH 33/46] Fix bug in the xpaths that were used wrongly to select sentences or paragraphs --- .../grobid/core/engines/DatasetParser.java | 43 ++++++++++++++++--- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 0b96908..6239434 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ 
b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -208,7 +208,7 @@ public List<List<Dataset>> processing(List<DatasetDocumentSequence> datasetDocum }*/ List<DatasetComponent> bufferLocalDatasetcomponents = resultExtractionLayoutTokens(resBlocks[i], tokens, text); bufferLocalDatasetcomponents.stream().forEach(datasetComponent -> { - datasetComponent.addSequenceId(datasetDocumentSequence.getId()); + datasetComponent.addSequenceId(datasetDocumentSequence.getId()); } ); List<OffsetPosition> localDatasetcomponentOffsets = new ArrayList<>(); @@ -1546,6 +1546,7 @@ public String processXML(File file) throws Exception { /** * Process dataset mentions from a TEI XML string + * segmentSentences, indicate that it needs to segment sentences */ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(String documentAsString, boolean segmentSentences, @@ -1590,7 +1591,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Title, abstract, keywords // If we process the TEI, at this point the document should be already segmented correctly. - boolean segmentSentences = false; + boolean extractParagraphs = false; XPath xPath = XPathFactory.newInstance().newXPath(); @@ -1616,7 +1617,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } try { - String expression = segmentSentences ? "//abstract/div/p" : "//abstract/div/p/s"; + String expression = extractParagraphs ? "//abstract/div/p" : "//abstract/div/p/s"; String expressionNoNamespaces = getXPathWithoutNamespaces(expression); org.w3c.dom.NodeList abstractNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, doc, @@ -1671,7 +1672,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Extraction from Body try { - String expression = segmentSentences ? "//text/body/div/p" : "//text/body/div/p/s"; + String expression = extractParagraphs ? "//text/body/div/p" : "//text/body/div/p/s"; String expressionNoNamespaces = getXPathWithoutNamespaces(expression); org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, doc, @@ -1750,7 +1751,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do currentSection = null; } } - String granularity = segmentSentences ? "p" : "s"; + String granularity = extractParagraphs ? "p" : "s"; org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate("./*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); for (int j = 0; j < textsAnnex.getLength(); j++) { org.w3c.dom.Node paragraphAnnex = textsAnnex.item(j); @@ -1786,7 +1787,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (String sectionType : specificSectionTypesAnnex) { try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; - expression = segmentSentences ? expression + "/*[local-name() = 's']" : ""; + expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, doc, XPathConstants.NODESET); @@ -1810,10 +1811,38 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } } + // Look into any div in the back that have no type, in case something is hidden there (e.g. 
availability statements) + + try { + String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or not(contains('" + String.join("|", specificSectionTypesAnnex) + "', concat('|', @type, '|')))]/*[local-name()='div']/*[local-name() = 'p']"; + expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; + org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); + for (int i = 0; i < annexNodeList.getLength(); i++) { + org.w3c.dom.Node item = annexNodeList.item(i); + String text = item.getTextContent(); + String normalizedText = normalize(text); + + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, analyzer.tokenizeWithLayoutToken(text), itemId); + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(true); + selectedSequences.add(localSequence); + availabilitySequences.add(localSequence); + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Generic statement in the back was not found, skipping."); + } + + //Footnotes try { String expression = "//*[local-name() = 'text']/*[local-name() = 'body']/*[local-name() = 'note'][@*[local-name()='place' and .='foot']]/*[local-name() = 'div']/*[local-name() = 'p']"; - expression = segmentSentences ? expression + "/*[local-name() = 's']" : ""; + expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, doc, XPathConstants.NODESET); From 3c3af44ce50d1563b4ef6f31cbd9aed232fa66a6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 4 Jun 2024 17:40:44 +1200 Subject: [PATCH 34/46] Try to get possible sections in the <back> in which the das is hidden benith --- .../grobid/core/engines/DatasetParser.java | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 6239434..075a1fc 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1752,7 +1752,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } } String granularity = extractParagraphs ? "p" : "s"; - org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate("./*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); + org.w3c.dom.NodeList textsAnnex = (org.w3c.dom.NodeList) xPath.evaluate(".//*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); for (int j = 0; j < textsAnnex.getLength(); j++) { org.w3c.dom.Node paragraphAnnex = textsAnnex.item(j); @@ -1812,6 +1812,56 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } // Look into any div in the back that have no type, in case something is hidden there (e.g. availability statements) + try { + String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type)]"; + org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + doc, + XPathConstants.NODESET); + for (int i = 0; i < nodeList.getLength(); i++) { + org.w3c.dom.Node item = nodeList.item(i); + + // Check the head? 
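The fallback scan being added here queries untyped `<div>` elements under the TEI `<back>` with `local-name()` predicates, so the XPath matches regardless of the TEI default namespace, and then inspects each section's `<head>` to decide whether it is a data availability statement. The snippet below is a minimal, self-contained illustration of that querying pattern using only the standard JDK XML APIs; the class name and the inline TEI fragment are invented for the example and are not part of the patch.

```java
import java.io.StringReader;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class LocalNameXPathSketch {
    public static void main(String[] args) throws Exception {
        // TEI documents use a default namespace, so a plain "//div" would not match
        // on a namespace-aware DOM; local-name() predicates sidestep prefix handling.
        String tei = "<TEI xmlns=\"http://www.tei-c.org/ns/1.0\"><text><back>"
                + "<div><head>Data availability</head><p><s>Data are deposited on Zenodo.</s></p></div>"
                + "</back></text></TEI>";

        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        Document doc = factory.newDocumentBuilder().parse(new InputSource(new StringReader(tei)));

        XPath xPath = XPathFactory.newInstance().newXPath();
        // Untyped <div> elements under <back>, mirroring the fallback expression above
        String expression = "//*[local-name() = 'text']/*[local-name() = 'back']"
                + "/*[local-name() = 'div'][not(@type)]";
        NodeList divs = (NodeList) xPath.evaluate(expression, doc, XPathConstants.NODESET);

        for (int i = 0; i < divs.getLength(); i++) {
            // Read the <head> of each candidate section, e.g. to detect an availability statement
            String head = (String) xPath.evaluate("./*[local-name() = 'head']",
                    divs.item(i), XPathConstants.STRING);
            System.out.println("Untyped back section with head: " + head);
        }
    }
}
```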
+ String currentSection = null; + org.w3c.dom.Node head = (org.w3c.dom.Node) xPath.evaluate("./*[local-name() = 'head']", item, XPathConstants.NODE); + if (head != null) { + String text = head.getTextContent(); + String normalizedText = normalize(text); + + if (checkDASAnnex(normalizedText)) { + currentSection = "das"; + } else if (checkAuthorAnnex(normalizedText) || checkAbbreviationAnnex(normalizedText)) { + currentSection = "author"; + } else { + currentSection = null; + } + } + String granularity = extractParagraphs ? "p" : "s"; + org.w3c.dom.NodeList textGeneralSections = (org.w3c.dom.NodeList) xPath.evaluate(".//*[local-name() = '" + granularity + "']", item, XPathConstants.NODESET); + for (int j = 0; j < textGeneralSections.getLength(); j++) { + org.w3c.dom.Node paragraphAnnex = textGeneralSections.item(j); + + String text = paragraphAnnex.getTextContent(); + String normalizedText = normalize(text); + String itemId = ((org.w3c.dom.Element) item).getAttribute("xml:id"); + DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, itemId); + + selectedSequences.add(localSequence); + + if (StringUtils.equals(currentSection, "das")) { + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(true); + } else { + localSequence.setRelevantSectionsNamedDatasets(true); + localSequence.setRelevantSectionsImplicitDatasets(false); + } + } + } + + } catch (XPathExpressionException e) { + // Ignore exception + LOGGER.warn("Generic sections in the <back> (without type) was not found, skipping."); + } + try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or not(contains('" + String.join("|", specificSectionTypesAnnex) + "', concat('|', @type, '|')))]/*[local-name()='div']/*[local-name() = 'p']"; From 7b6fe062aca0c4a99ae1c678fb946c853e467f57 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Sat, 14 Sep 2024 18:08:19 +0200 Subject: [PATCH 35/46] update to grobid 0.8.1, and catch up other changes --- Dockerfile.datastet | 2 +- build.gradle | 4 +-- .../core/analyzers/DatastetAnalyzer.java | 5 ++++ .../core/engines/DataseerClassifier.java | 28 ++++++++----------- .../grobid/core/engines/DatasetParser.java | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/Dockerfile.datastet b/Dockerfile.datastet index 6814850..e28f647 100644 --- a/Dockerfile.datastet +++ b/Dockerfile.datastet @@ -49,7 +49,7 @@ RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip && \ # build runtime image # ------------------- -FROM lfoppiano/grobid:0.8.0-full-slim as runtime +FROM lfoppiano/grobid:0.8.1-full as runtime # setting locale is likely useless but to be sure ENV LANG C.UTF-8 diff --git a/build.gradle b/build.gradle index 5243eac..b1d6e18 100644 --- a/build.gradle +++ b/build.gradle @@ -118,8 +118,8 @@ dependencies { implementation 'org.apache.opennlp:opennlp-tools:1.9.1' //Grobid - implementation group: 'org.grobid', name: 'grobid-core', version: '0.8.0' - implementation group: 'org.grobid', name: 'grobid-trainer', version: '0.8.0' + implementation group: 'org.grobid', name: 'grobid-core', version: '0.8.1' + implementation group: 'org.grobid', name: 'grobid-trainer', version: '0.8.1' //Tests testImplementation group: 'junit', name: 'junit', version: '4.12' diff --git a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java index 8fc06a8..b089324 100644 --- 
a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java +++ b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java @@ -88,6 +88,11 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) { return result; } + @Override + public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) { + throw new UnsupportedOperationException("Method retokenizeFromLayoutToken not yet implemented"); + } + public List<String> retokenize(List<String> chunks) { List<String> result = new ArrayList<>(); for (String chunk : chunks) { diff --git a/src/main/java/org/grobid/core/engines/DataseerClassifier.java b/src/main/java/org/grobid/core/engines/DataseerClassifier.java index 011b169..9a32848 100644 --- a/src/main/java/org/grobid/core/engines/DataseerClassifier.java +++ b/src/main/java/org/grobid/core/engines/DataseerClassifier.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; @@ -124,7 +125,7 @@ public DatastetConfiguration getDatastetConfiguration() { public String classify(String text) throws Exception { if (StringUtils.isEmpty(text)) return null; - List<String> texts = new ArrayList<String>(); + List<String> texts = new ArrayList<>(); texts.add(text); return classify(texts); } @@ -158,14 +159,15 @@ public String classifyFirstLevel(String text) throws Exception { * @return JSON string */ public String classify(List<String> texts) throws Exception { - if (texts == null || texts.size() == 0) + + if (CollectionUtils.isEmpty(texts)) return null; logger.info("classify: " + texts.size() + " sentence(s)"); ObjectMapper mapper = new ObjectMapper(); String the_json = classifierBinary.classify(texts); // first pass to select texts to be cascaded to next level - List<String> cascaded_texts = new ArrayList<String>(); + List<String> cascaded_texts = new ArrayList<>(); JsonNode root = null; if (the_json != null && the_json.length() > 0) { root = mapper.readTree(the_json); @@ -399,7 +401,7 @@ public String prettyPrintJsonString(String json, ObjectMapper mapper) { * Enrich a TEI document with Dataseer information * @return enriched TEI string */ - public String processTEIString(String xmlString) throws Exception { + public String processTEIString(String xmlString, boolean segmentSentences) throws Exception { String tei = null; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -407,10 +409,8 @@ public String processTEIString(String xmlString) throws Exception { DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(xmlString))); //document.getDocumentElement().normalize(); - tei = processTEIDocument(document, false); - } catch(ParserConfigurationException e) { - e.printStackTrace(); - } catch(IOException e) { + tei = processTEIDocument(document, segmentSentences); + } catch(ParserConfigurationException | IOException e) { e.printStackTrace(); } return tei; @@ -437,9 +437,7 @@ public String processTEI(String filePath, boolean segmentSentences, boolean avoi if (avoidDomParserBug) tei = restoreDomParserAttributeBug(tei); - } catch(ParserConfigurationException e) { - e.printStackTrace(); - } catch(IOException e) { + } catch(ParserConfigurationException | IOException e) 
{ e.printStackTrace(); } return tei; @@ -494,9 +492,7 @@ public String processJATS(String filePath) throws Exception { //if (avoidDomParserBug) // tei = restoreDomParserAttributeBug(tei); - } catch(ParserConfigurationException e) { - e.printStackTrace(); - } catch(IOException e) { + } catch(ParserConfigurationException | IOException e) { e.printStackTrace(); } finally { if (newFilePath != null) { @@ -1054,7 +1050,7 @@ public String restoreDomParserAttributeBug(String xml) { * @return enriched TEI string */ public String processPDF(String filePath) throws Exception { - // convert PDF into structured TEI thanks to GROBID0 + // convert PDF into structured TEI thanks to GROBID List<String> coordinates = new ArrayList<>(); coordinates.add("s"); @@ -1067,7 +1063,7 @@ public String processPDF(String filePath) throws Exception { .generateTeiCoordinates(coordinates) .build(); String tei = engine.fullTextToTEI(new File(filePath), config); - return processTEIString(tei); + return processTEIString(tei, false); } } diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 075a1fc..217e26a 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1672,7 +1672,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Extraction from Body try { - String expression = extractParagraphs ? "//text/body/div/p" : "//text/body/div/p/s"; + String expression = extractParagraphs ? "//text/body//div/p" : "//text/body//div/p/s"; String expressionNoNamespaces = getXPathWithoutNamespaces(expression); org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expressionNoNamespaces, doc, From 2162720ad8d265a9e753a422d1ba66936324039f Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Sun, 13 Oct 2024 10:18:05 +0200 Subject: [PATCH 36/46] retrieve URLs from the TEI XML in all the sections that are of interest (cherry picked from commit da6746ce99a250eb86460e9f8bfdd0ac587124fb) --- .../org/grobid/core/engines/DatasetParser.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 217e26a..e62cb34 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1590,7 +1590,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do //Extract relevant section from the TEI // Title, abstract, keywords - // If we process the TEI, at this point the document should be already segmented correctly. + // TODO: remove this If we process the TEI, at this point the document should be already segmented correctly. 
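The `extractParagraphs` flag used below drives one recurring decision in `processTEIDocument`: select whole `<p>` paragraphs when the input TEI is not sentence-segmented, or drill down to the `<s>` elements produced by GROBID/Pub2TEI when it is. The following sketch isolates that switch; the helper and class names are illustrative and not taken from the codebase.

```java
public class GranularitySketch {

    /**
     * Build the body XPath for the requested granularity: paragraphs under any
     * body division, or their sentence children when sentence segmentation is
     * already present in the TEI.
     */
    static String buildBodyExpression(boolean extractParagraphs) {
        String expression = "//text/body//div/p";
        return extractParagraphs ? expression : expression + "/s";
    }

    public static void main(String[] args) {
        System.out.println(buildBodyExpression(true));  // //text/body//div/p
        System.out.println(buildBodyExpression(false)); // //text/body//div/p/s
    }
}
```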
boolean extractParagraphs = false; XPath xPath = XPathFactory.newInstance().newXPath(); @@ -1770,6 +1770,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsNamedDatasets(true); localSequence.setRelevantSectionsImplicitDatasets(false); } + + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight(); + localSequence.setReferences(referencesInText); } } @@ -1803,6 +1806,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsImplicitDatasets(true); selectedSequences.add(localSequence); availabilitySequences.add(localSequence); + + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); + localSequence.setReferences(referencesInText); } } catch (XPathExpressionException e) { @@ -1854,6 +1860,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsNamedDatasets(true); localSequence.setRelevantSectionsImplicitDatasets(false); } + + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) paragraphAnnex, 0).getRight(); + localSequence.setReferences(referencesInText); } } @@ -1881,6 +1890,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsImplicitDatasets(true); selectedSequences.add(localSequence); availabilitySequences.add(localSequence); + + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); + localSequence.setReferences(referencesInText); } } catch (XPathExpressionException e) { @@ -1911,6 +1923,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsImplicitDatasets(false); selectedSequences.add(localSequence); availabilitySequences.add(localSequence); + + Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); + localSequence.setReferences(referencesInText); } } catch (XPathExpressionException e) { From a2b5bbb5a805a8dc5753861e70509c965fe060e2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Sun, 13 Oct 2024 11:38:40 +0200 Subject: [PATCH 37/46] update github actions --- .github/workflows/ci-build-manual.yml | 37 ++++++++++++++++++++++----- .github/workflows/ci-build.yml | 19 +++++++++----- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml index 293580f..2fdfb9e 100644 --- a/.github/workflows/ci-build-manual.yml +++ b/.github/workflows/ci-build-manual.yml @@ -1,8 +1,13 @@ name: Build and push a development version on docker -on: - workflow_dispatch: - +on: + workflow_dispatch: + inputs: + custom_tag: + type: string + description: Docker image tag + required: true + default: "latest-develop" jobs: build: @@ -25,8 +30,25 @@ jobs: steps: - name: Create more disk space - run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - uses: actions/checkout@v2 + 
run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /opt/hostedtoolcache + sudo rm -rf /opt/google/chrome + sudo rm -rf /opt/microsoft/msedge + sudo rm -rf /opt/microsoft/powershell + sudo rm -rf /opt/pipx + sudo rm -rf /usr/lib/mono + sudo rm -rf /usr/local/julia* + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/lib/node_modules + sudo rm -rf /usr/local/share/chromium + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/share/swift + - uses: actions/checkout@v4 - name: Build and push id: docker_build uses: mr-smithers-excellent/docker-build-push@v6 @@ -37,6 +59,7 @@ jobs: image: lfoppiano/datastet registry: docker.io pushImage: true - tags: latest-develop + tags: | + latest-develop, ${{ github.event.inputs.custom_tag}} - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 18f9a31..d9c7621 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -36,13 +36,6 @@ jobs: runs-on: ubuntu-latest steps: -# - name: Maximize build disk space -# uses: easimon/maximize-build-space@v10 -# with: -# remove-dotnet: 'true' -# remove-haskell: 'true' -# remove-codeql: 'true' -# remove-android: 'true' - name: Create more disk space run: | sudo rm -rf /usr/share/dotnet @@ -50,6 +43,18 @@ jobs: sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" sudo rm -rf /opt/hostedtoolcache + sudo rm -rf /opt/google/chrome + sudo rm -rf /opt/microsoft/msedge + sudo rm -rf /opt/microsoft/powershell + sudo rm -rf /opt/pipx + sudo rm -rf /usr/lib/mono + sudo rm -rf /usr/local/julia* + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/lib/node_modules + sudo rm -rf /usr/local/share/chromium + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/share/swift - uses: actions/checkout@v4 - name: Build and push id: docker_build From e3a48909deec3b3ff2fe3bbeaf2b21663eeed866 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Sun, 13 Oct 2024 12:24:52 +0200 Subject: [PATCH 38/46] fix xpath to fall back into div into TEI/back (cherry picked from commit 920323fa0cb5cf5dc1bfab1aed2df90918065b3d) --- src/main/java/org/grobid/core/engines/DatasetParser.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index e62cb34..4d6a4da 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1873,7 +1873,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do try { - String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or not(contains('" + String.join("|", specificSectionTypesAnnex) + "', concat('|', @type, '|')))]/*[local-name()='div']/*[local-name() = 'p']"; +// String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (not(contains(@type, 'availability')) and not(contains(@type, 'acknowledgement')) and not(contains(@type, 'funding')))]/*[local-name()='div']/*[local-name() = 'p']"; + + String expression = "//*[local-name() = 'text']/*[local-name() = 
'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type-> "not(contains(@type, '"+type+"'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']"; expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, doc, From 371f520d14020a6434a8559aa35f29ed9067124a Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Mon, 14 Oct 2024 01:53:19 +0200 Subject: [PATCH 39/46] cleanup (cherry picked from commit e256ffa0ea2a616a072f005fdd45bc74261436c6) --- src/main/java/org/grobid/core/engines/DatasetParser.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 4d6a4da..87575da 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -952,8 +952,8 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, if (availabilityTokens != null) { // we attach and match bibliographical reference callout TEIFormatter formatter = new TEIFormatter(doc, parsers.getFullTextParser()); // second pass, body - if ((bodyClusters != null) && (resCitations != null) && (resCitations.size() > 0)) { - List<BiblioComponent> bibRefComponents = new ArrayList<BiblioComponent>(); + if (bodyClusters != null && CollectionUtils.isNotEmpty(resCitations)) { + List<BiblioComponent> bibRefComponents = new ArrayList<>(); for (TaggingTokenCluster cluster : bodyClusters) { if (cluster == null) { continue; @@ -1873,8 +1873,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do try { -// String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (not(contains(@type, 'availability')) and not(contains(@type, 'acknowledgement')) and not(contains(@type, 'funding')))]/*[local-name()='div']/*[local-name() = 'p']"; - String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type-> "not(contains(@type, '"+type+"'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']"; expression = extractParagraphs ? 
expression : expression + "/*[local-name() = 's']"; org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, From 1483aab346a46d54258d622b5be7715d233dca33 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Mon, 14 Oct 2024 01:53:30 +0200 Subject: [PATCH 40/46] fix reference mapping (cherry picked from commit c92adb110b0092381836d13cad33d82d0a2c7aaf) --- src/main/java/org/grobid/core/engines/DatasetParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 87575da..ad21a91 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1975,7 +1975,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (String refText : ref.keySet()) { Triple<OffsetPosition, String, String> infos = ref.get(refText); - String target = infos.getRight(); + String target = infos.getMiddle(); OffsetPosition position = infos.getLeft(); Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target); From 4ab67a61b13867d1cb70aa085f6653e98f6f9446 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Mon, 14 Oct 2024 03:18:00 +0200 Subject: [PATCH 41/46] fix references extraction (cherry picked from commit 27194da5c8855cec60104c51e8e9b951eeb420a7) --- .../java/org/grobid/core/data/Dataset.java | 2 +- .../grobid/core/engines/DatasetParser.java | 121 +++++++++++------- .../controller/DatastetProcessFile.java | 34 ++--- 3 files changed, 92 insertions(+), 65 deletions(-) diff --git a/src/main/java/org/grobid/core/data/Dataset.java b/src/main/java/org/grobid/core/data/Dataset.java index 250494f..483f658 100644 --- a/src/main/java/org/grobid/core/data/Dataset.java +++ b/src/main/java/org/grobid/core/data/Dataset.java @@ -279,7 +279,7 @@ public void setBibRefs(List<BiblioComponent> bibRefs) { public void addBibRef(BiblioComponent bibRef) { if (bibRefs == null) { - bibRefs = new ArrayList<BiblioComponent>(); + bibRefs = new ArrayList<>(); } bibRefs.add(bibRef); } diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index ad21a91..94e3381 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1690,8 +1690,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do localSequence.setRelevantSectionsImplicitDatasets(true); selectedSequences.add(localSequence); - // Capture URLs if available - + // Capture URLs and references if available Map<String, Triple<OffsetPosition, String, String>> referencesInText = XMLUtilities.getTextNoRefMarkersAndMarkerPositions((org.w3c.dom.Element) item, 0).getRight(); localSequence.setReferences(referencesInText); } @@ -1873,7 +1872,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do try { - String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", specificSectionTypesAnnex.stream().map(type-> "not(contains(@type, '"+type+"'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']"; + String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][not(@type) or (" + String.join(" and ", 
specificSectionTypesAnnex.stream().map(type -> "not(contains(@type, '" + type + "'))").collect(Collectors.joining())) + ")]/*[local-name()='div']/*[local-name() = 'p']"; expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, doc, @@ -1981,6 +1980,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target); if (referenceInformation != null) { BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); + refText = refText.replaceAll("[\\[\\], ]+", ""); + biblioRefMap.put(refText, biblioItem); BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", ""))); biblioComponent.setRawForm(refText); @@ -1999,8 +2000,6 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do List<LayoutToken> allDocumentTokens = new ArrayList<>(); - int startingOffset = 0; - List<Integer> sentenceOffsetStarts = new ArrayList<>(); for (DatasetDocumentSequence sequence : selectedSequences) { List<LayoutToken> sentenceTokens = datastetAnalyzer.tokenizeWithLayoutToken(sequence.getText()); sequence.setTokens(sentenceTokens); @@ -2028,34 +2027,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // } // }); // - int finalStartingOffset = startingOffset; - List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream() - .map(lt -> { - lt.setOffset(lt.getOffset() + finalStartingOffset); - return lt; - }) - .collect(Collectors.toList()); +// int finalStartingOffset = startingOffset; +// List<LayoutToken> sentenceTokenAllTokens = sentenceTokens.stream() +// .map(lt -> { +// lt.setOffset(lt.getOffset() + finalStartingOffset); +// return lt; +// }) +// .collect(Collectors.toList()); - allDocumentTokens.addAll(sentenceTokenAllTokens); - sentenceOffsetStarts.add(startingOffset); - startingOffset += sequence.getText().length(); + allDocumentTokens.addAll(sentenceTokens); } - List<List<Dataset>> datasetLists = processing(selectedSequences, false); + List<List<Dataset>> datasetLists = processing(selectedSequences, disambiguate); entities.addAll(datasetLists); - for (int i = 0; i < entities.size(); i++) { - List<Dataset> datasets = entities.get(i); - if (datasets == null) { - continue; - } - for (Dataset dataset : datasets) { - if (dataset == null) - continue; - dataset.setGlobalContextOffset(sentenceOffsetStarts.get(i)); - } - } // TODO make sure that selectedSequences == allSentences above in the processPDF? 
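The reference handling in this hunk converts TEI pointer targets such as `b12` into numeric keys via `target.replace("b", "")` and strips brackets, commas and spaces from the raw callout text before using it as a map key. Both transformations are easy to check in isolation; the sketch below is illustrative only, with invented method names, and ignores the error handling a robust parser would need.

```java
public class RefCalloutSketch {

    /** Turn a TEI pointer value such as "#b12" or "b12" into the numeric key 12. */
    static int toRefKey(String target) {
        return Integer.parseInt(target.replaceFirst("^#", "").replace("b", ""));
    }

    /** Strip brackets, commas and spaces from a raw callout, e.g. "[12]" -> "12". */
    static String cleanCallout(String rawCallout) {
        return rawCallout.replaceAll("[\\[\\], ]+", "");
    }

    public static void main(String[] args) {
        System.out.println(toRefKey("#b12"));     // 12
        System.out.println(cleanCallout("[12]")); // 12
    }
}
```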
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList(); @@ -2101,7 +2087,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do termPattern, placeTaken.get(i), frequencies, - sentenceOffsetStarts.get(i) + 0 +// sentenceOffsetStarts.get(i) ); if (localEntities != null) { Collections.sort(localEntities); @@ -2154,7 +2141,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Enhance information in dataset entities if (CollectionUtils.isNotEmpty(bibRefComponents)) { // attach references to dataset entities - entities = attachRefBib(entities, bibRefComponents); + entities = attachRefBibSimple(entities, bibRefComponents); } // consolidate the attached ref bib (we don't consolidate all bibliographical references @@ -2168,7 +2155,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do for (BiblioComponent bibRef : bibRefs) { Integer refKeyVal = bibRef.getRefKey(); if (!consolidated.contains(refKeyVal)) { - BiblioItem biblioItem = biblioRefMap.get(refKeyVal); + BiblioItem biblioItem = biblioRefMap.get(String.valueOf(refKeyVal)); BibDataSet biblioDataSet = new BibDataSet(); biblioDataSet.setResBib(biblioItem); citationsToConsolidate.add(biblioDataSet); @@ -2179,19 +2166,21 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } } - try { - Consolidation consolidator = Consolidation.getInstance(); - Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate); - for (int j = 0; j < citationsToConsolidate.size(); j++) { - BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); - BiblioItem bibo = resConsolidation.get(j); - if (bibo != null) { - BiblioItem.correct(resCitation, bibo); + if (StringUtils.isNotBlank(datastetConfiguration.getGluttonHost())) { + try { + Consolidation consolidator = Consolidation.getInstance(); + Map<Integer, BiblioItem> resConsolidation = consolidator.consolidate(citationsToConsolidate); + for (int j = 0; j < citationsToConsolidate.size(); j++) { + BiblioItem resCitation = citationsToConsolidate.get(j).getResBib(); + BiblioItem bibo = resConsolidation.get(j); + if (bibo != null) { + BiblioItem.correct(resCitation, bibo); + } } + } catch (Exception e) { + throw new GrobidException( + "An exception occurred while running consolidation on bibliographical references.", e); } - } catch (Exception e) { - throw new GrobidException( - "An exception occured while running consolidation on bibliographical references.", e); } // propagate the bib. ref. to the entities corresponding to the same dataset name without bib. ref. 
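The switch from `attachRefBib` to `attachRefBibSimple` above reflects that, when processing a TEI input, reference callout offsets are local to each sentence: a callout is attached to a dataset mention when its start offset falls within a few characters after the mention ends. Below is a compact stand-alone sketch of that proximity rule, with plain records standing in for `DatasetComponent` and `BiblioComponent`, which carry far more state in the real classes.

```java
import java.util.List;
import java.util.stream.Collectors;

public class ProximityAttachSketch {

    // Simplified stand-ins for the mention and callout offsets used by the parser
    record Mention(String name, int start, int end) {}
    record RefCallout(String rawForm, int start, int end) {}

    /** Return the callouts whose start lies within 'distance' characters after the mention. */
    static List<RefCallout> attach(Mention mention, List<RefCallout> callouts, int distance) {
        return callouts.stream()
                .filter(ref -> ref.start() >= mention.end() && ref.start() <= mention.end() + distance)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        Mention dataset = new Mention("GenBank", 10, 17);
        List<RefCallout> callouts = List.of(
                new RefCallout("[12]", 18, 22),  // immediately after the mention: attached
                new RefCallout("[3]", 60, 63));  // far away in the sentence: ignored
        System.out.println(attach(dataset, callouts, 5)); // keeps only "[12]"
    }
}
```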
@@ -2230,8 +2219,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do entities = DatasetContextClassifier.getInstance(datastetConfiguration) .classifyDocumentContexts(entities); - List<BibDataSet> resCitations = List.of(); - return Pair.of(entities, resCitations); + return Pair.of(entities, citationsToConsolidate); } private static String normalize(String text) { @@ -2355,10 +2343,11 @@ public static boolean checkDASAnnex(List<LayoutToken> annexTokens) { return false; } - /** - * Try to attach relevant bib ref component to dataset entities - */ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) { + return attachRefBib(entities, refBibComponents, 5); + } + + public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) { // we anchor the process to the dataset names and aggregate other closest components on the right // if we cross a bib ref component we attach it, if a bib ref component is just after the last @@ -2387,7 +2376,7 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli for (BiblioComponent refBib : refBibComponents) { //System.out.println(refBib.getOffsetStart() + " - " + refBib.getOffsetStart()); if ((refBib.getOffsetStart() >= pos) && - (refBib.getOffsetStart() <= endPos + 5)) { + (refBib.getOffsetStart() <= endPos + distance)) { entity.addBibRef(refBib); endPos = refBib.getOffsetEnd(); } @@ -2398,6 +2387,42 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli return entities; } + /** + * Try to attach relevant bib ref component to dataset entities, this does not use the global offset as in the + * TEI all references' offsets are local to the sentence + */ + public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) { + return attachRefBib(entities, refBibComponents, 5); + } + + public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) { + + // we anchor the process to the dataset names and aggregate other closest components on the right + // if we cross a bib ref component we attach it, if a bib ref component is just after the last + // component of the entity group, we attach it + for (List<Dataset> datasets : entities) { + for (Dataset entity : datasets) { + if (entity.getDatasetName() == null) + continue; + + // find the name component and the offset + DatasetComponent nameComponent = entity.getDatasetName(); + int pos = nameComponent.getOffsetEnd(); + + // find included or just next bib ref callout + List<BiblioComponent> relatedReferences = refBibComponents.stream() + .filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance) + .collect(Collectors.toList()); + + if (CollectionUtils.isNotEmpty(relatedReferences)) { + entity.setBibRefs(relatedReferences); + } + } + } + + return entities; + } + public List<List<OffsetPosition>> preparePlaceTaken(List<List<Dataset>> entities) { List<List<OffsetPosition>> localPositions = new ArrayList<>(); for (List<Dataset> datasets : entities) { @@ -2690,7 +2715,7 @@ public List<Dataset> propagateLayoutTokenSequence(DatasetDocumentSequence sequen entity.getSequenceIdentifiers().addAll(name.getSequenceIdentifiers()); //entity.setType(DatastetLexicon.Dataset_Type.DATASET); entity.setPropagated(true); - entity.setGlobalContextOffset(sentenceOffsetStart); +// 
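The service code around this point assembles the response by hand with a `StringBuilder`, emitting `md5`, a `mentions` array, a `references` array and the `runtime`. The Jackson-based sketch below only illustrates that overall response shape; the per-mention and per-reference fields shown here are placeholders, not the actual `Dataset` and `BibDataSet` serialization.

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class ResponseShapeSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        ObjectNode root = mapper.createObjectNode();

        root.put("md5", "0A1B2C3D");                 // MD5 digest of the uploaded file, hex upper-case

        ArrayNode mentions = root.putArray("mentions");
        ObjectNode mention = mentions.addObject();
        mention.put("rawForm", "GenBank");           // placeholder for one serialized dataset mention

        ArrayNode references = root.putArray("references");
        ObjectNode reference = references.addObject();
        reference.put("refKey", 12);                 // placeholder for one attached bibliographical record

        root.put("runtime", 0.42);                   // processing time in seconds

        System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(root));
    }
}
```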
entity.setGlobalContextOffset(sentenceOffsetStart); if (entities == null) entities = new ArrayList<>(); entities.add(entity); diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index f32d7ed..6c74566 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.inject.Inject; import com.google.inject.Singleton; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.data.BibDataSet; @@ -338,7 +339,7 @@ public static Response processDatasetJATS(final InputStream inputStream, json.append(", \"md5\": \"" + md5Str + "\""); json.append(", \"mentions\":["); - if (extractedEntities != null && extractedEntities.size()>0) { + if (CollectionUtils.isNotEmpty(extractedEntities)) { boolean startList = true; for(List<Dataset> results : extractedEntities) { for(Dataset dataset : results) { @@ -353,12 +354,12 @@ public static Response processDatasetJATS(final InputStream inputStream, json.append("], \"references\":["); -// if (extractionResult != null) { -// List<BibDataSet> bibDataSet = extractionResult.getRight(); -// if (bibDataSet != null && bibDataSet.size()>0) { -// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); -// } -// } + if (CollectionUtils.isNotEmpty(extractedEntities)) { + List<BibDataSet> bibDataSet = extractionResult.getRight(); + if (CollectionUtils.isNotEmpty(bibDataSet)) { + DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); + } + } json.append("]"); @@ -442,7 +443,7 @@ public static Response processDatasetTEI(final InputStream inputStream, String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase(); json.append(", \"md5\": \"" + md5Str + "\""); json.append(", \"mentions\":["); - if (extractedEntities != null && extractedEntities.size()>0) { + if (CollectionUtils.isNotEmpty(extractedEntities)) { boolean startList = true; for(List<Dataset> results : extractedEntities) { for(Dataset dataset : results) { @@ -454,14 +455,15 @@ public static Response processDatasetTEI(final InputStream inputStream, } } } - json.append("], \"references\":[]"); - -// if (extractionResult != null) { -// List<BibDataSet> bibDataSet = extractionResult.getRight(); -// if (bibDataSet != null && bibDataSet.size()>0) { -// DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); -// } -// } + json.append("], \"references\":["); + + if (CollectionUtils.isNotEmpty(extractedEntities)) { + List<BibDataSet> bibDataSet = extractionResult.getRight(); + if (CollectionUtils.isNotEmpty(bibDataSet)) { + DatastetServiceUtils.serializeReferences(json, bibDataSet, extractedEntities); + } + } + json.append("]"); float runtime = ((float)(end-start)/1000); json.append(", \"runtime\": "+ runtime); From 774dd78e8582a8dfd580c9c1d8998dcc4691e5bb Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 22 Oct 2024 04:09:10 +0200 Subject: [PATCH 42/46] fix regression (cherry picked from commit 417280562a7bf0bfb2786d66f15c85ae6bebe34e) --- resources/config/config.yml | 4 ++-- src/main/java/org/grobid/core/engines/DatasetParser.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/config/config.yml 
b/resources/config/config.yml index 3bb04a3..ac04b76 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -9,7 +9,7 @@ tmpPath: "tmp/" # path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI pub2teiPath: "../../Pub2TEI/" -gluttonHost: "https://cloud.science-miner.com/glutton" +gluttonHost: gluttonPort: # entity-fishing server information for performing entity disambiguation @@ -36,7 +36,7 @@ models: window: 20 nbMaxIterations: 2000 - # classifier model, dataset binary (datset or not dataset in the current sentence) + # classifier model, dataset binary (dataset or not dataset in the current sentence) - name: "dataseer-binary" engine: "delft" delft: diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 94e3381..379cd00 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -873,7 +873,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, String clusterText = LayoutTokensUtil.toText(cluster.concatTokens()); List<LayoutToken> localTokenization = cluster.concatTokens(); - if (CollectionUtils.isNotEmpty(localTokenization)) + if (CollectionUtils.isEmpty(localTokenization)) continue; if (TEIFormatter.MARKER_LABELS.contains(clusterLabel)) { From b18454b5ca149e1bd60a37b6bc83ed6c6083cb6d Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 22 Oct 2024 04:09:29 +0200 Subject: [PATCH 43/46] cosmetics (cherry picked from commit 0a5cedd91434a345bc79657236cf1dbe142650e6) --- .../grobid/core/engines/DatasetParser.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 379cd00..5323e7e 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -234,17 +234,17 @@ public List<List<Dataset>> processing(List<DatasetDocumentSequence> datasetDocum for (Dataset entity : localDatasets) { if (entity.getDatasetName() != null) { String term = entity.getDatasetName().getNormalizedForm(); - if (term == null || term.length() == 0) { - indexToBeFiltered.add(Integer.valueOf(k)); + if (StringUtils.isBlank(term)) { + indexToBeFiltered.add(k); } else if (DatastetLexicon.getInstance().isEnglishStopword(term)) { - indexToBeFiltered.add(Integer.valueOf(k)); + indexToBeFiltered.add(k); } else if (DatastetLexicon.getInstance().isBlackListedNamedDataset(term.toLowerCase())) { - indexToBeFiltered.add(Integer.valueOf(k)); + indexToBeFiltered.add(k); } } k++; } - if (indexToBeFiltered.size() > 0) { + if (CollectionUtils.isNotEmpty(indexToBeFiltered)) { for (int j = indexToBeFiltered.size() - 1; j >= 0; j--) { localDatasets.remove(indexToBeFiltered.get(j).intValue()); } @@ -1596,7 +1596,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do XPath xPath = XPathFactory.newInstance().newXPath(); try { - org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate("//*[local-name() = 'titleStmt']/*[local-name() = 'title']", + org.w3c.dom.Node titleNode = (org.w3c.dom.Node) xPath.evaluate( + "//*[local-name() = 'titleStmt']/*[local-name() = 'title']", doc, XPathConstants.NODE); if (titleNode == null) { @@ -1729,7 +1730,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Annex might contain 
misclassified relevant sections try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='annex']]/*[local-name() = 'div']"; - org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate( + expression, doc, XPathConstants.NODESET); for (int i = 0; i < bodyNodeList.getLength(); i++) { @@ -1783,14 +1785,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // specific section types statement DatastetAnalyzer datastetAnalyzer = DatastetAnalyzer.getInstance(); - List<String> specificSectionTypesAnnex = Arrays.asList("availability", "acknowledgement", "funding"); + // Looks like acknowledgment and funding may be misleading + List<String> specificSectionTypesAnnex = Arrays.asList("availability", "data-availability"); List<DatasetDocumentSequence> availabilitySequences = new ArrayList<>(); for (String sectionType : specificSectionTypesAnnex) { try { String expression = "//*[local-name() = 'text']/*[local-name() = 'back']/*[local-name() = 'div'][@*[local-name()='type' and .='" + sectionType + "']]/*[local-name() = 'div']/*[local-name() = 'p']"; expression = extractParagraphs ? expression : expression + "/*[local-name() = 's']"; - org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, + org.w3c.dom.NodeList annexNodeList = (org.w3c.dom.NodeList) xPath.evaluate( + expression, doc, XPathConstants.NODESET); for (int i = 0; i < annexNodeList.getLength(); i++) { From 962f7ebd10628643f04906e0b6dc083812fff815 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Tue, 22 Oct 2024 09:46:24 +0200 Subject: [PATCH 44/46] fix regressions in the way we attach references from TEI (cherry picked from commit 1e658fda090cab67a6ad94633683c4ba7e83a014) --- .../grobid/core/engines/DatasetParser.java | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 5323e7e..40283b8 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -1965,16 +1965,25 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // We need to link the references and their callout - List<BiblioComponent> bibRefComponents = new ArrayList<>(); + List<List<BiblioComponent>> referencesAsBiblioComponentSequences = new ArrayList<>(); Map<String, BiblioItem> biblioRefMap = new HashMap<>(); - List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream() - .map(DatasetDocumentSequence::getReferences) - .filter(map -> map.values().stream() - .anyMatch(triple -> triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) - .toList(); + List<Map<String, Triple<OffsetPosition, String, String>>> referencesInSequences = selectedSequences.stream() + .map(sequence -> sequence.getReferences().entrySet().stream() + .filter(entry -> BIBLIO_CALLOUT_TYPE.equals(entry.getValue().getRight())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))) + .collect(Collectors.toList()); + +// List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream() +// .map(DatasetDocumentSequence::getReferences) +// .filter(map -> map.values().stream() +// .anyMatch(triple -> 
triple.getRight().equals(BIBLIO_CALLOUT_TYPE))) +// .toList(); + + // We iterate over the sequences, and transform each reference into a BiblioComponent + for (Map<String, Triple<OffsetPosition, String, String>> ref : referencesInSequences) { + List<BiblioComponent> referencesInSequence = new ArrayList<>(); - for (Map<String, Triple<OffsetPosition, String, String>> ref : referencesList) { for (String refText : ref.keySet()) { Triple<OffsetPosition, String, String> infos = ref.get(refText); @@ -1984,19 +1993,22 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target); if (referenceInformation != null) { BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); - refText = refText.replaceAll("[\\[\\], ]+", ""); + String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); - biblioRefMap.put(refText, biblioItem); - BiblioComponent biblioComponent = new BiblioComponent(biblioItem, Integer.parseInt(target.replace("b", ""))); + biblioRefMap.put(refTextClean, biblioItem); + BiblioComponent biblioComponent = new BiblioComponent( + biblioItem, Integer.parseInt(target.replace("b", "")) + ); biblioComponent.setRawForm(refText); biblioComponent.setOffsetStart(position.start); biblioComponent.setOffsetEnd(position.end); // TODO: fetch the coords if they are in the TEI // List<BoundingBox> boundingBoxes = BoundingBoxCalculator.calculate(refTokens); // biblioComponent.setBoundingBoxes(boundingBoxes); - bibRefComponents.add(biblioComponent); + referencesInSequence.add(biblioComponent); } } + referencesAsBiblioComponentSequences.add(referencesInSequence); } // Dataset Recognition @@ -2143,9 +2155,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do // Enhance information in dataset entities - if (CollectionUtils.isNotEmpty(bibRefComponents)) { + if (CollectionUtils.isNotEmpty(referencesAsBiblioComponentSequences)) { // attach references to dataset entities - entities = attachRefBibSimple(entities, bibRefComponents); + entities = attachRefBibSimple(entities, referencesAsBiblioComponentSequences); } // consolidate the attached ref bib (we don't consolidate all bibliographical references @@ -2395,36 +2407,39 @@ public List<List<Dataset>> attachRefBib(List<List<Dataset>> entities, List<Bibli * Try to attach relevant bib ref component to dataset entities, this does not use the global offset as in the * TEI all references' offsets are local to the sentence */ - public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents) { - return attachRefBib(entities, refBibComponents, 5); + public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<List<BiblioComponent>> refBibComponents) { + return attachRefBibSimple(entities, refBibComponents, 5); } - public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> entities, List<BiblioComponent> refBibComponents, int distance) { + public List<List<Dataset>> attachRefBibSimple(List<List<Dataset>> datasetsSequences, List<List<BiblioComponent>> referencesSequences, int distance) { // we anchor the process to the dataset names and aggregate other closest components on the right // if we cross a bib ref component we attach it, if a bib ref component is just after the last // component of the entity group, we attach it - for (List<Dataset> datasets : entities) { - for (Dataset entity : datasets) { - if 
(entity.getDatasetName() == null) + for (int seqIdx = 0; seqIdx < datasetsSequences.size(); seqIdx++) { + List<Dataset> datasets = datasetsSequences.get(seqIdx); + List<BiblioComponent> references = referencesSequences.get(seqIdx); + + for (Dataset dataset : datasets) { + if (dataset.getDatasetName() == null) continue; // find the name component and the offset - DatasetComponent nameComponent = entity.getDatasetName(); - int pos = nameComponent.getOffsetEnd(); + DatasetComponent nameComponent = dataset.getDatasetName(); + int datasetEndPosition = nameComponent.getOffsetEnd(); - // find included or just next bib ref callout - List<BiblioComponent> relatedReferences = refBibComponents.stream() - .filter(ref -> ref.getOffsetStart() >= pos && ref.getOffsetEnd() <= pos + distance) + // find included or just next bib ref callout within a distance of 5 characters + List<BiblioComponent> relatedReferences = references.stream() + .filter(ref -> ref.getOffsetStart() >= datasetEndPosition && ref.getOffsetStart() <= datasetEndPosition + distance) .collect(Collectors.toList()); if (CollectionUtils.isNotEmpty(relatedReferences)) { - entity.setBibRefs(relatedReferences); + dataset.setBibRefs(relatedReferences); } } } - return entities; + return datasetsSequences; } public List<List<OffsetPosition>> preparePlaceTaken(List<List<Dataset>> entities) { From 3b343c6fc9867a65df7ca19a2433961b154cc390 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Wed, 1 Jan 2025 18:20:09 +0100 Subject: [PATCH 45/46] allow xml:id to be string using a wrapper that generates integer to maintain the compatibility with the rest of the processing --- Readme.md | 17 +- .../org/grobid/core/data/BiblioComponent.java | 2 +- .../core/data/BiblioComponentWrapper.java | 49 ++ .../grobid/core/engines/DatasetParser.java | 32 +- .../grobid/core/utilities/XMLUtilities.java | 61 +- ...om scientific literature.with_urls.tei.xml | 760 ------------------ 6 files changed, 128 insertions(+), 793 deletions(-) create mode 100644 src/main/java/org/grobid/core/data/BiblioComponentWrapper.java delete mode 100644 src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml diff --git a/Readme.md b/Readme.md index fb9056f..ce73e87 100644 --- a/Readme.md +++ b/Readme.md @@ -200,7 +200,22 @@ curl --form input=@./src/test/resources/PMC1636350.pdf --form disambiguate=1 loc For PDF, each entity will be associated with a list of bounding box coordinates relative to the PDF, see [here](https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/#coordinate-system-in-the-pdf) for more explanation about the coordinate system. -In addition, the response will contain the bibliographical reference information associated to a dataset mention when found. The bibliographical information are provided in XML TEI (similar format as GROBID). +In addition, the response will contain the bibliographical reference information associated to a dataset mention when found. +The bibliographical information are provided in XML TEI (similar format as GROBID). + +#### /service/annotateDatasetTEI + +This entry-point consumes the TEI-XML file from Grobid or pub2tei. 
+ +| method | request type | response type | parameters | requirement | description | +|--------|-----------------------|--------------------|--------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| POST | `multipart/form-data` | `application/json` | `input` | required | TEI file to be processed | +| | | | `segmentSentences` | optional | Indicate whether to apply sentence segmentation. If the TEI was segmented before (by Grobid, for example) this should be set to '0'. | + +[//]: # (| | | | `disambiguate` | optional | `disambiguate` is a string of value `0` (no disambiguation, default value) or `1` (disambiguate and inject Wikidata entity id and Wikipedia pageId) |) + + +Using ```curl``` POST request with a __TEI-XML file__: ## Contact and License diff --git a/src/main/java/org/grobid/core/data/BiblioComponent.java b/src/main/java/org/grobid/core/data/BiblioComponent.java index 27887e4..821bc61 100644 --- a/src/main/java/org/grobid/core/data/BiblioComponent.java +++ b/src/main/java/org/grobid/core/data/BiblioComponent.java @@ -29,7 +29,7 @@ public class BiblioComponent extends DatasetComponent { // the full matched bibliographical reference record protected BiblioItem biblio = null; - // identifier for relating callout and reference, should be cconsistent with + // identifier for relating callout and reference, should be consistent with // a full text TEI produced by GROBID protected int refKey = -1; diff --git a/src/main/java/org/grobid/core/data/BiblioComponentWrapper.java b/src/main/java/org/grobid/core/data/BiblioComponentWrapper.java new file mode 100644 index 0000000..8d110a2 --- /dev/null +++ b/src/main/java/org/grobid/core/data/BiblioComponentWrapper.java @@ -0,0 +1,49 @@ +package org.grobid.core.data; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +public class BiblioComponentWrapper { + private Map<String, Integer> stringToRefKeyMap; + private Map<Integer, String> refKeyToStringMap; + private AtomicInteger refKeyGenerator; + + public BiblioComponentWrapper() { + stringToRefKeyMap = new HashMap<>(); + refKeyToStringMap = new HashMap<>(); + refKeyGenerator = new AtomicInteger(0); + } + + public void addMapping(String refKeyString) { + if (!stringToRefKeyMap.containsKey(refKeyString)) { + int refKey = refKeyGenerator.incrementAndGet(); + stringToRefKeyMap.put(refKeyString, refKey); + refKeyToStringMap.put(refKey, refKeyString); + } + } + + public Integer getRefKey(String refKeyString) { + String refKeyStringClean = refKeyString.replaceFirst("^#", ""); + addMapping(refKeyStringClean); + return stringToRefKeyMap.get(refKeyStringClean); + } + + public String getRefKeyString(int refKey) { + return refKeyToStringMap.get(refKey); + } + + public void removeMapping(String refKeyString) { + Integer refKey = stringToRefKeyMap.remove(refKeyString); + if (refKey != null) { + refKeyToStringMap.remove(refKey); + } + } + + public void removeMapping(int refKey) { + String refKeyString = refKeyToStringMap.remove(refKey); + if (refKeyString != null) { + stringToRefKeyMap.remove(refKeyString); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 40283b8..60125b6 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ 
-962,7 +962,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file, TaggingLabel clusterLabel = cluster.getTaggingLabel(); List<LayoutToken> localTokenization = cluster.concatTokens(); - if ((localTokenization == null) || (localTokenization.size() == 0)) + if (CollectionUtils.isEmpty(localTokenization)) continue; if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { @@ -1937,7 +1937,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do } // Read and parse references - Map<String, Pair<String, org.w3c.dom.Node>> referenceMap = new HashMap<>(); + + BiblioComponentWrapper biblioComponentWrapper = new BiblioComponentWrapper(); + + Map<Integer, Pair<String, org.w3c.dom.Node>> referenceMap = new HashMap<>(); try { String expression = "//*[local-name() = 'div'][@*[local-name()='type' and .='references']]/*[local-name() = 'listBibl']/*[local-name() = 'biblStruct']"; org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression, @@ -1953,7 +1956,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do String referenceText = item.getTextContent(); String normalizedReferenceText = normalize(referenceText); String cleanedRawReferenceText = normalizedReferenceText.replaceAll("\\p{Space}+", " ").strip().replaceAll("[ ]{2,}", ", "); - referenceMap.put(attribute.getNodeValue(), Pair.of(cleanedRawReferenceText, item)); + referenceMap.put(biblioComponentWrapper.getRefKey(attribute.getNodeValue()), Pair.of(cleanedRawReferenceText, item)); } } } @@ -1974,6 +1977,18 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))) .collect(Collectors.toList()); +// List<Map<String, Triple<OffsetPosition, String, String>>> referencesInSequences = selectedSequences.stream() +// .map(sequence -> sequence.getReferences().entrySet().stream() +// .filter(entry -> BIBLIO_CALLOUT_TYPE.equals(entry.getValue().getRight())) +// .collect( +// Collectors.toMap( +// entry -> String.valueOf(biblioComponentWrapper.getRefKey(entry.getValue().getMiddle())), +// Map.Entry::getValue +// ) +// ) +// ) +// .collect(Collectors.toList()); + // List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream() // .map(DatasetDocumentSequence::getReferences) // .filter(map -> map.values().stream() @@ -1990,15 +2005,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do String target = infos.getMiddle(); OffsetPosition position = infos.getLeft(); - Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target); + Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target)); if (referenceInformation != null) { BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); biblioRefMap.put(refTextClean, biblioItem); - BiblioComponent biblioComponent = new BiblioComponent( - biblioItem, Integer.parseInt(target.replace("b", "")) - ); + + Integer refKey = biblioComponentWrapper.getRefKey(target); + + BiblioComponent biblioComponent = new BiblioComponent(biblioItem, refKey); biblioComponent.setRawForm(refText); biblioComponent.setOffsetStart(position.start); biblioComponent.setOffsetEnd(position.end); @@ -2238,7 +2254,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do return 
Pair.of(entities, citationsToConsolidate); } - private static String normalize(String text) { + public static String normalize(String text) { String normalizedText = UnicodeUtil.normaliseText(text); normalizedText = normalizedText.replace("\n", " "); normalizedText = normalizedText.replace("\t", " "); diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 9532bcf..d5d5a95 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -33,6 +33,8 @@ import java.io.StringWriter; import java.util.*; +import static org.grobid.core.engines.DatasetParser.normalize; + /** * Some convenient methods for suffering a bit less with XML. */ @@ -82,7 +84,7 @@ public static String toPrettyString(String xml, int indent) { public static Element getFirstDirectChild(Element parent, String name) { for(Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) { - if (child instanceof Element && name.equals(child.getNodeName())) + if (child instanceof Element && name.equals(child.getNodeName())) return (Element) child; } return null; @@ -91,8 +93,8 @@ public static Element getFirstDirectChild(Element parent, String name) { public static Element getLastDirectChild(Element parent, String name) { NodeList children = parent.getChildNodes(); for(int j=children.getLength()-1; j>0; j--) { - Node child = children.item(j); - if (child instanceof Element && name.equals(child.getNodeName())) + Node child = children.item(j); + if (child instanceof Element && name.equals(child.getNodeName())) return (Element) child; } return null; @@ -123,7 +125,7 @@ public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElemen } catch(Exception e) { if (teiXML != null) LOGGER.warn("The parsing of the biblStruct from TEI document failed for: " + teiXML); - else + else LOGGER.warn("The parsing of the biblStruct from TEI document failed for: " + biblStructElement.toString()); } return handler.getBiblioItem(); @@ -138,7 +140,7 @@ public static String getTextNoRefMarkers(Element element) { if (node.getNodeType() == Node.ELEMENT_NODE) { if ("ref".equals(node.getNodeName())) continue; - } + } if (node.getNodeType() == Node.TEXT_NODE) { buf.append(node.getNodeValue()); found = true; @@ -147,6 +149,19 @@ public static String getTextNoRefMarkers(Element element) { return found ? 
buf.toString() : null; } + public static String getTextRecursively(Node node) { + StringBuilder textContent = new StringBuilder(); + NodeList children = node.getChildNodes(); + for (int i = 0; i < children.getLength(); i++) { + Node child = children.item(i); + if (child.getNodeType() == Node.TEXT_NODE) { + textContent.append(child.getNodeValue()); + } else if (child.getNodeType() == Node.ELEMENT_NODE) { + textContent.append(getTextRecursively(child)); + } + } + return textContent.toString(); + } /** * @return Pair with text or null on the left and a Triple with (position, target and type) */ @@ -181,16 +196,16 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g for (int j = 0; j < list2.getLength(); j++) { Node subChildNode = list2.item(j); if (subChildNode.getNodeType() == Node.TEXT_NODE) { - String chunk = subChildNode.getNodeValue(); + String chunk = normalize(getTextRecursively(node)); if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); - right.put(chunk, refInfo); + right.put(StringUtils.strip(chunk), refInfo); String holder = StringUtils.repeat(" ", chunk.length()); buf.append(holder); } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); - right.put(chunk, urlInfo); + right.put(StringUtils.strip(chunk), urlInfo); // we still add added like normal text buf.append(chunk); found = true; @@ -254,8 +269,8 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { XPathFactory xpathFactory = XPathFactory.newInstance(); // XPath to find empty text nodes. XPathExpression xpathExp = xpathFactory.newXPath().compile( - "//text()[normalize-space(.) = '']"); - NodeList emptyTextNodes = (NodeList) + "//text()[normalize-space(.) = '']"); + NodeList emptyTextNodes = (NodeList) xpathExp.evaluate(doc, XPathConstants.NODESET); // Remove each empty text node from document. @@ -368,7 +383,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { // Return pretty print xml string StringWriter stringWriter = new StringWriter(); transformer.transform(new DOMSource(document), new StreamResult(stringWriter)); - + // write result to file FileUtils.writeStringToFile(outputFile, stringWriter.toString(), "UTF-8"); @@ -386,7 +401,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { /** * Return the document ID where the annotation is located - */ + */ private static String getDocIdFromRs(org.w3c.dom.Node node) { String result = null; // first go up to the tei element root @@ -423,11 +438,11 @@ private static String getDocIdFromRs(org.w3c.dom.Node node) { } public static String stripNonValidXMLCharacters(String in) { - StringBuffer out = new StringBuffer(); - char current; + StringBuffer out = new StringBuffer(); + char current; - if (in == null || ("".equals(in))) - return ""; + if (in == null || ("".equals(in))) + return ""; for (int i = 0; i < in.length(); i++) { current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught here; it should not happen. 
if ((current == 0x9) || @@ -439,7 +454,7 @@ public static String stripNonValidXMLCharacters(String in) { out.append(current); } return out.toString(); - } + } private static List<String> textualElements = Arrays.asList("p", "figDesc"); @@ -451,7 +466,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { final NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { final Node n = children.item(i); - if ( (n.getNodeType() == Node.ELEMENT_NODE) && + if ( (n.getNodeType() == Node.ELEMENT_NODE) && (textualElements.contains(n.getNodeName())) ) { // text content //String text = n.getTextContent(); @@ -492,7 +507,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); - org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(fullSent))); + org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(fullSent))); } catch(Exception e) { fail = true; } @@ -509,7 +524,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { //System.out.println("-----------------"); sent = sent.replace("\n", " "); sent = sent.replaceAll("( )+", " "); - + //Element sentenceElement = doc.createElement("s"); //sentenceElement.setTextContent(sent); //newNodes.add(sentenceElement); @@ -539,12 +554,12 @@ public static void segment(org.w3c.dom.Document doc, Node node) { if (n.getNodeName().equals("figDesc")) { Element theDiv = doc.createElementNS("http://www.tei-c.org/ns/1.0", "div"); Element theP = doc.createElementNS("http://www.tei-c.org/ns/1.0", "p"); - for(Node theNode : newNodes) + for(Node theNode : newNodes) theP.appendChild(theNode); theDiv.appendChild(theP); n.appendChild(theDiv); } else { - for(Node theNode : newNodes) + for(Node theNode : newNodes) n.appendChild(theNode); } @@ -561,7 +576,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { * @param args Command line arguments. */ public static void main(String[] args) { - + // we are expecting one argument, absolute path to the TEICorpus document if (args.length != 1) { diff --git a/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml b/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml deleted file mode 100644 index 2ab3daa..0000000 --- a/src/test/resources/org/grobid/core/engines/Semi-automatic staging area for high-quality structured data extraction from scientific literature.with_urls.tei.xml +++ /dev/null @@ -1,760 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink"> - <teiHeader xml:lang="en"> - <fileDesc> - <titleStmt> - <title level="a" type="main">Science and Technology of Advanced Materials: Methods - - MEXT - - - unknown - - - Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) - - - - - - - - 14 Dec 2023. 
- - - - - - LucaFoppiano - 0000-0002-6114-6164 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
- - Knowledge and Data Engineering - Centre for Computational Sciences - University of Tsukuba -
- Tsukuba - Japan; -
-
-
- - TomoyaMato - 0000-0002-0918-6468 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
-
- - KenseiTerashima - 0000-0003-0375-3043 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - PedroOrtiz Suarez - 0000-0003-0343-8852 - - GmbH DFKI - CONTACT Luca Foppiano -
- Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii - 0000-0003-0375 - Berlin - DE -
-
-
- - TakuTou - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - ChikakoSakai - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - Wei-ShengWang - 0009-0001-3572-5736 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - ToshiyukiAmagasa - 0000-0003-0595-2230 - - Knowledge and Data Engineering - Centre for Computational Sciences - University of Tsukuba -
- Tsukuba - Japan; -
-
-
- - YoshihikoTakano - 0000-0002-1541-6928 - - Frontier Superconducting Materials Group - MANA - NIMS -
- Tsukuba - Japan; -
-
-
- - MasashiIshii - ishii.masashi@nims.go.jp - 0000-0003-0357-2832 - - Materials Modelling Group - Centre for Basic Research on Materials - Data-driven Materials Research Field - NIMS -
- Tsukuba - Japan; -
-
-
- - Masashi - - Science and Technology of Advanced Materials: Methods -
- - Print - - 14 Dec 2023. - - - DCB0425EE18794E34CC3A3075E3E3975 - 10.1080/27660400.2023.2286219 - Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023 -
-
- - - - - GROBID - A machine learning software for extracting information from scholarly documents - - - - - - - - Materials informatics - superconductors - machine learning - database - TDM - - - -

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations; the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such a training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

-
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

-
-
- - - -
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

-
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

-
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

-
Curation status

The curation status (Figure 1) is defined by the type of action, manual or automatic, and the status, which can assume the following values (a rough sketch of the corresponding transition logic is given after the list):

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ). • removed: the record has been removed by a curator (internal status).
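To make the allowed transitions concrete, the following is a minimal, hypothetical Java sketch of these statuses and of the manual actions a)-d) described at the start of the curation workflow section; the class and method names are illustrative only and are not taken from the SuperCon 2 codebase.

import java.util.Objects;

public class CurationRecord {

    // Statuses listed above; OBSOLETE and REMOVED are internal (hidden in the interface).
    enum Status { NEW, CURATED, VALIDATED, INVALID, OBSOLETE, REMOVED }

    // Whether the last change came from a person or from an automatic process.
    enum ActionType { MANUAL, AUTOMATIC }

    private Status status = Status.NEW;            // default status for a new record
    private ActionType actionType = ActionType.AUTOMATIC;

    public void markValid() {                      // action a): mark as valid
        status = Status.VALIDATED;
        actionType = ActionType.MANUAL;
    }

    public void markInvalid(String errorType) {    // action b): an error type must be selected
        Objects.requireNonNull(errorType, "an error type must be selected");
        status = Status.INVALID;
        actionType = ActionType.MANUAL;
    }

    public CurationRecord correct(String errorType) {  // action c): the old record becomes obsolete
        Objects.requireNonNull(errorType, "an error type must be selected");
        this.status = Status.OBSOLETE;
        CurationRecord updated = new CurationRecord(); // updated values live in a new record
        updated.status = Status.CURATED;
        updated.actionType = ActionType.MANUAL;
        return updated;
    }

    public void remove(String errorType) {         // action d): remove the record
        Objects.requireNonNull(errorType, "an error type must be selected");
        status = Status.REMOVED;
        actionType = ActionType.MANUAL;
    }

    public Status getStatus() { return status; }

    public static void main(String[] args) {
        CurationRecord record = new CurationRecord();                   // automatic, new
        CurationRecord corrected = record.correct("Curation amends");   // old record becomes obsolete
        corrected.markValid();                                          // manual, validated
        System.out.println(record.getStatus() + " -> " + corrected.getStatus());
    }
}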

-
Error types

We first introduced error type in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows: • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface. • Curation amends: The curator is updating the data which does not present issues due to the automatic system.

-
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches one of the following conditions (a rough sketch of these checks is given after the list):

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.

Records identified as anomalies are given the status 'invalid' and the error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.
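A rough Java sketch of these rule-based checks, for illustration only: the real pipeline relies on an ensemble composition parser built on Pymatgen [15] and text2chem [16] (Python libraries), so the formula check below is a simple placeholder, and all names are hypothetical.

import java.util.ArrayList;
import java.util.List;

public class AnomalyDetector {

    static final double ROOM_TEMPERATURE_K = 273.0;
    static final double MAX_PRESSURE_GPA = 250.0;

    /**
     * Returns the reasons why a record should be marked as status 'invalid'
     * with error type 'anomaly detection'. An empty list means no anomaly.
     */
    public static List<String> findAnomalies(String tcRaw, String formulaRaw, String pressureRaw) {
        List<String> reasons = new ArrayList<>();

        // Rule 1: Tc must parse and must not be negative or above room temperature.
        Double tc = parseDouble(tcRaw);
        if (tc == null || tc < 0 || tc > ROOM_TEMPERATURE_K) {
            reasons.add("invalid critical temperature: " + tcRaw);
        }

        // Rule 2: the chemical formula must be parseable.
        // Placeholder check only; the real system uses Pymatgen and text2chem.
        if (formulaRaw == null || !formulaRaw.matches("([A-Z][a-z]?\\d*(\\.\\d+)?)+")) {
            reasons.add("unparseable chemical formula: " + formulaRaw);
        }

        // Rule 3: applied pressure, when present, must parse and fall within 0-250 GPa.
        if (pressureRaw != null) {
            Double pressure = parseDouble(pressureRaw);
            if (pressure == null || pressure < 0 || pressure > MAX_PRESSURE_GPA) {
                reasons.add("invalid applied pressure: " + pressureRaw);
            }
        }
        return reasons;
    }

    private static Double parseDouble(String value) {
        if (value == null) return null;
        try {
            return Double.parseDouble(value.trim());
        } catch (NumberFormatException e) {
            return null;   // e.g. '41]' cannot be parsed
        }
    }

    public static void main(String[] args) {
        // '41]' is the malformed Tc example mentioned in the text; 300 GPa is out of range.
        System.out.println(findAnomalies("41]", "LaFeAsO0.9F0.1", "300"));
    }
}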

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

-
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

-
Training data collection

In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined and corrected, and feedback to the ML model.

-
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2) in which each row corresponds to a training example composed by the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export them. We integrated our interface with Labelstudio [17] for the correction of the collected training examples. Label-studio is an open-source, python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

-
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

-
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

-
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

-
Curation and processing logs

The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

-
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2) Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.

-
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure. This indicates an appropriate low rate of false positives although a study with a larger dataset might be necessary.

-
Training data generation

We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of the F1-score from 76.67% 2 to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. This experiment gives interesting insight into the positive impact of the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular the learning rate and batch size, could still be better tuned to obtain better results with the second and third training protocols.

Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. (s) indicates 'training from scratch', while (i) indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs of train and evaluation.

-
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators -a senior researcher (SD), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data is available in Tables A1 andA2.

We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. F1-Score is a harmonic means of precision and recall.
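For reference, a minimal Java example showing how these three measures are derived from true-positive (TP), false-positive (FP) and false-negative (FN) counts such as those reported in Table A2; the class is illustrative only and is not part of the evaluated system.

public class EvaluationMetrics {

    /** Precision: fraction of extracted records that are correct, in percent. */
    static double precision(int tp, int fp) {
        return tp + fp == 0 ? 0.0 : 100.0 * tp / (tp + fp);
    }

    /** Recall: fraction of expected records that were captured, in percent. */
    static double recall(int tp, int fn) {
        return tp + fn == 0 ? 0.0 : 100.0 * tp / (tp + fn);
    }

    /** F1-score: harmonic mean of precision and recall. */
    static double f1(double p, double r) {
        return p + r == 0 ? 0.0 : 2.0 * p * r / (p + r);
    }

    public static void main(String[] args) {
        // Example: 13 true positives, 1 false positive, 0 false negatives
        // gives P = 92.86, R = 100.00, F1 = 96.30 (one of the rows of Table A2).
        double p = precision(13, 1);
        double r = recall(13, 0);
        System.out.printf("P=%.2f R=%.2f F1=%.2f%n", p, r, f1(p, r));
    }
}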

-
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-score approximately 13% higher than other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, leaving only a few senior researchers for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

-
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the

Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

-
Conclusions

We built a semi-automatic staging area, called SuperCon 2, to efficiently validate new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer. Under the hood, the workflow runs 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately +6% and +47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronics and thermoelectric research, and by expanding the evaluation to a larger dataset [22].

-
Notes

1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues: https://github.com/kermitt2/delft/issues/150.

Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and result in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
-
Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio [17]. The column 'status' indicates whether the example has been sent or not to the external tool.
-
Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.
-
Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record being examined, (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.
-
Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
-
(1) base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
-
Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates. By clicking on the 'record id', it is possible to visualise the latest record values.
-
-
T c classification: The temperature is not correctly classified
-
Table 4. Evaluation scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

Entity counts per label, base vs. base+curation dataset (Δ = additional entities collected through curation):

Label         base    base+curation    Δ
<class>       1646    1732             86
<material>    6943    7580             637
<me_method>   1883    1934             51
<pressure>    274     361              87
<tc>          3741    4269             528
<tcValue>     1099    1556             457
Total         15586   17432            1846

-
Table 3. Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).

Method         P (%)    R (%)    F1 (%)   # docs
PDF document   87.83    45.61    52.67    15
Interface      93.38    92.51    92.02    15
-
Table 5. Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).

Experience   Method         P (%)    R (%)    F1 (%)   # docs   # pages
MS           PDF document   94.58    36.55    48.67    6        46
MS           Interface      83.19    95.83    88.25    4        50
PD           PDF document   70.00    48.51    50.78    5        49
PD           Interface      96.67    82.86    88.11    5        51
SR           PDF document   100.00   55.56    61.03    4        51
SR           Interface      97.42    98.33    97.78    6        45
-
Table A2 .Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
-


-


-


-


-


-


-


- - - -
-
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

-
-
-

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

-
-
-
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

-
- - - 305-0044 - - -
-
Disclosure statement

No potential conflict of interest was reported by the author(s).

-
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

-
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

-
            </listBibl>
        </div>
    </back>
</text>
</TEI>
\ No newline at end of file

From f58c4930ff85c5bf413e3e26fdd93a1c8e420ab1 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 2 Jan 2025 19:48:38 +0100
Subject: [PATCH 46/46] fix extraction of urls that are not well formed
 (supplementary-material generated by pub2tei)

(cherry picked from commit 39c0e43ee5d6cfb5a2f4d7ef0439ec814cceb73c)
---
 .../core/engines/DatasetDisambiguator.java  | 77 +++++--------------
 .../grobid/core/engines/DatasetParser.java  |  6 +-
 .../grobid/core/utilities/XMLUtilities.java |  4 +-
 3 files changed, 27 insertions(+), 60 deletions(-)

diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
index 4744b25..c1b78e7 100644
--- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
+++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
@@ -1,73 +1,36 @@
 package org.grobid.core.engines;
 
-import nu.xom.Attribute;
-import nu.xom.Element;
+import com.fasterxml.jackson.core.io.JsonStringEncoder;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpEntity;
 import org.apache.http.client.config.RequestConfig;
-import org.grobid.core.GrobidModels;
-import org.grobid.core.data.DatasetComponent;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.conn.HttpHostConnectException;
+import org.apache.http.entity.ContentType;
+import org.apache.http.entity.mime.HttpMultipartMode;
+import org.apache.http.entity.mime.MultipartEntityBuilder;
+import org.apache.http.entity.mime.content.StringBody;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
 import org.grobid.core.data.Dataset;
-import org.grobid.core.data.BiblioItem;
-import org.grobid.core.document.Document;
-import org.grobid.core.document.DocumentPiece;
-import org.grobid.core.document.DocumentSource;
-import org.grobid.core.document.xml.XmlBuilderUtils;
-import org.grobid.core.engines.config.GrobidAnalysisConfig;
-import org.grobid.core.engines.label.DatasetTaggingLabels;
-import org.grobid.core.engines.label.SegmentationLabels;
-import org.grobid.core.engines.label.TaggingLabel;
-import org.grobid.core.engines.label.TaggingLabels;
-import org.grobid.core.exceptions.GrobidException;
-import org.grobid.core.factory.GrobidFactory;
-import org.grobid.core.features.FeaturesVectorDataseer;
-import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.data.DatasetComponent;
 import org.grobid.core.layout.LayoutToken;
-import org.grobid.core.layout.LayoutTokenization;
-import org.grobid.core.lexicon.DatastetLexicon;
 import org.grobid.core.utilities.DatastetConfiguration;
-import org.grobid.core.utilities.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.xml.sax.InputSource;
-
-import com.fasterxml.jackson.core.*;
-import com.fasterxml.jackson.databind.*;
-import com.fasterxml.jackson.databind.node.*;
-import com.fasterxml.jackson.annotation.*;
-import com.fasterxml.jackson.core.io.*;
-
-import java.io.*;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.*;
-import java.net.HttpURLConnection;
+import java.io.File;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
-
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.HttpEntity;
-import org.apache.http.util.EntityUtils;
-import org.apache.http.entity.mime.content.StringBody;
-import org.apache.http.entity.ContentType;
-import org.apache.http.entity.mime.MultipartEntityBuilder;
-import org.apache.http.entity.mime.HttpMultipartMode;
-import org.apache.http.conn.HttpHostConnectException;
-import org.apache.commons.lang3.tuple.Pair;
-
-import static org.apache.commons.lang3.StringUtils.*;
-import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
+import java.util.*;
 
 /**
  * Dataset entity disambiguator. Once dataset mentions are recognized and grouped
diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index 60125b6..39cffe6 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -550,7 +550,11 @@ private List<DatasetComponent> addUrlComponentsAsReferences(DatasetDocumentSeque
         String target = urlInfos.getMiddle();
         // String type = urlInfos.getRight();
 
-        DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end));
+        String sequenceText = sequence.getText();
+        if (sequenceText.length() <= pos.start || sequenceText.length() <= pos.end) {
+            continue;
+        }
+        DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end));
         urlComponent.setOffsetStart(pos.start);
         urlComponent.setOffsetEnd(pos.end);
         if (target != null) {
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index d5d5a95..63aebab 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -223,7 +223,7 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
             for (int j = 0; j < list2.getLength(); j++) {
                 Node node2 = list2.item(j);
                 if (node2.getNodeType() == Node.TEXT_NODE) {
-                    String chunk = node2.getNodeValue();
+                    String chunk = normalize(node2.getNodeValue());
                     buf.append(chunk);
                     found = true;
                     indexPos += chunk.length();
@@ -231,7 +231,7 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
                 }
             }
         } else if (node.getNodeType() == Node.TEXT_NODE) {
-            String chunk = node.getNodeValue();
+            String chunk = normalize(node.getNodeValue());
             buf.append(chunk);
             found = true;
             indexPos += chunk.length();
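
The DatasetParser hunk above guards the substring call against URL offsets that fall outside the sequence text, which is how the malformed supplementary-material URLs produced by pub2tei were triggering failures. Below is a minimal, self-contained sketch of the same defensive pattern; the class and method names (SafeSpan, extractSpan) are illustrative only and are not part of the DataStet code base.

import java.util.Optional;

// Illustrative sketch only: mirrors the bounds check introduced in
// DatasetParser.addUrlComponentsAsReferences, but outside the project code base.
public class SafeSpan {

    // Returns the span text when [start, end) lies inside the text, empty otherwise,
    // so malformed offsets are skipped instead of raising StringIndexOutOfBoundsException.
    static Optional<String> extractSpan(String text, int start, int end) {
        if (text == null || start < 0 || end < start || end > text.length()) {
            return Optional.empty();
        }
        return Optional.of(text.substring(start, end));
    }

    public static void main(String[] args) {
        String sequence = "Data available at https://example.org/dataset";
        int start = sequence.indexOf("https://");
        System.out.println(extractSpan(sequence, start, sequence.length()));      // Optional[https://example.org/dataset]
        System.out.println(extractSpan(sequence, start, sequence.length() + 30)); // Optional.empty
    }
}

The patch itself skips the offending position with continue rather than returning an empty value, but the effect is the same: out-of-range offsets no longer abort the extraction.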
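The XMLUtilities hunks apply normalize(...) to each DOM text node before the chunk is appended and before indexPos is advanced, so the recorded offsets refer to the same accumulated string that is later sliced. The project's normalize(...) implementation is not shown in this patch; the sketch below assumes a simple whitespace-collapsing normalizer purely to illustrate why the appended chunk and the offset bookkeeping must use the identical string.

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only: the whitespace-collapsing normalize() below is an
// assumption, not the actual XMLUtilities implementation.
public class OffsetAccumulator {

    static String normalize(String s) {
        return s.replaceAll("\\s+", " ");
    }

    public static void main(String[] args) {
        List<String> textNodes = List.of("Supplementary  material:\n ", "https://example.org/data ", "(accessed 2024)");
        StringBuilder buf = new StringBuilder();
        List<int[]> spans = new ArrayList<>();   // [start, end) of each chunk in the accumulated text
        int indexPos = 0;
        for (String raw : textNodes) {
            String chunk = normalize(raw);       // append and measure the SAME string
            buf.append(chunk);
            spans.add(new int[]{indexPos, indexPos + chunk.length()});
            indexPos += chunk.length();
        }
        // Because offsets were computed on the normalized chunks, every span can be
        // cut back out of the accumulated text without going out of range.
        for (int[] span : spans) {
            System.out.println("[" + buf.substring(span[0], span[1]) + "]");
        }
    }
}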