From 33320d98de9e8b5c695d1c62ea6753a7927c8637 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Wed, 20 Apr 2022 19:22:39 +0200 Subject: [PATCH 1/4] chg: added data loaders for 2014 - 2022 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 265 +++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 biodatasets/bioasq_task_a/bioasq_task_a.py diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py new file mode 100644 index 00000000..e4ecdc13 --- /dev/null +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -0,0 +1,265 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: FIXME: Add a description +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. 
+ * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import ijson +import json +from pathlib import Path +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@article{tsatsaronis2015overview, + title = { + An overview of the BIOASQ large-scale biomedical semantic indexing and + question answering competition + }, + author = { + Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos + and Partalas, Ioannis and Zschunke, Matthias and Alvers, Michael R and + Weissenborn, Dirk and Krithara, Anastasia and Petridis, Sergios and + Polychronopoulos, Dimitris and others + }, + year = 2015, + journal = {BMC bioinformatics}, + publisher = {BioMed Central Ltd}, + volume = 16, + number = 1, + pages = 138 +} +""" + +_DATASETNAME = "bioasq_task_a" + +# TODO: Find description and copy it +_BIOASQ_2014A_DESCRIPTION = "" +_BIOASQ_2014bA_DESCRIPTION = "" + +_BIOASQ_2015A_DESCRIPTION = "" +_BIOASQ_2015bA_DESCRIPTION = "" + +_BIOASQ_2016A_DESCRIPTION = "" +_BIOASQ_2016bA_DESCRIPTION = "" + +_BIOASQ_2017A_DESCRIPTION = "" +_BIOASQ_2018A_DESCRIPTION = "" +_BIOASQ_2019A_DESCRIPTION = "" +_BIOASQ_2020A_DESCRIPTION = "" +_BIOASQ_2021A_DESCRIPTION = "" +_BIOASQ_2022A_DESCRIPTION = "" + +_DESCRIPTION = { + "bioasq_2014a": _BIOASQ_2014A_DESCRIPTION, + "bioasq_2014ba": _BIOASQ_2014bA_DESCRIPTION, + "bioasq_2015a": _BIOASQ_2015A_DESCRIPTION, + "bioasq_2015ba": _BIOASQ_2015bA_DESCRIPTION, + "bioasq_2016a": _BIOASQ_2016A_DESCRIPTION, + "bioasq_2016ba": _BIOASQ_2016bA_DESCRIPTION, + "bioasq_2017a": _BIOASQ_2017A_DESCRIPTION, + "bioasq_2018a": _BIOASQ_2018A_DESCRIPTION, + "bioasq_2019a": _BIOASQ_2019A_DESCRIPTION, + 
"bioasq_2020a": _BIOASQ_2020A_DESCRIPTION, + "bioasq_2021a": _BIOASQ_2021A_DESCRIPTION, + "bioasq_2022a": _BIOASQ_2022A_DESCRIPTION, +} + +_HOMEPAGE = "http://participants-area.bioasq.org/datasets/" + +# Data access requires prior registration with BioASQ. +# See http://participants-area.bioasq.org/accounts/register/ +_LICENSE = "https://www.nlm.nih.gov/databases/download/terms_and_conditions.html" + +# TODO: FIXME: Add bioasq 2013 +_URLS = { + "bioasq_2014a": "allMeSH.zip", + "bioasq_2014ba": "allMeSH_limitjournals.zip", + "bioasq_2015a": "allMeSH.zip", + "bioasq_2015ba": "allMeSH_limitjournals.zip", + "bioasq_2016a": "allMeSH_2016.zip", + "bioasq_2016ba": "allMeSH_limitjournals_2016.zip", + "bioasq_2017a": "allMeSH_2017.zip", + "bioasq_2018a": "allMeSH_2018.zip", + "bioasq_2019a": "allMeSH_2019.zip", + "bioasq_2020a": "allMeSH_2020.zip", + "bioasq_2021a": "allMeSH_2021.zip", + "bioasq_2022a": "allMeSH_2022.zip", +} + +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class BioasqTaskADataset(datasets.GeneratorBasedBuilder): + """ + BioASQ Task A On Biomedical Text Classification. + Creates configs for BioASQ A 2013 through BioASQ A 2021. 
+ """ + + DEFAULT_CONFIG_NAME = "bioasq_2014a_source" + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # BioASQ A 2014 through BioASQ A 2022 + BUILDER_CONFIGS = [] + for year in (( + "2014", + "2014b", + "2015", + "2015b", + "2016", + "2016b", + "2017", + "2018", + "2019", + "2020", + "2021", + "2022", + )): + BUILDER_CONFIGS.extend([ + BigBioConfig( + name=f"bioasq_{year}a_source", + version=SOURCE_VERSION, + description=f"bioasq {year} Task A source schema", + schema="source", + subset_id=f"bioasq_{year}a", + ), + BigBioConfig( + name=f"bioasq_{year}a_bigbio_text", + version=BIGBIO_VERSION, + description=f"bioasq {year} Task A in simplified BigBio schema", + schema="bigbio_text", + subset_id=f"bioasq_{year}a", + ) + ]) + + def _info(self) -> datasets.DatasetInfo: + # BioASQ Task A source schema + if self.config.schema == "source": + features = datasets.Features( + { + "abstractText": datasets.Value("string"), + "journal": datasets.Value("string"), + "meshMajor": [datasets.Value("string")], + "pmid": datasets.Value("string"), + "title": datasets.Value("string"), + "year": datasets.Value("string"), + } + ) + # simplified schema for text classification tasks + elif self.config.schema == "bigbio_text": + features = schemas.text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION[self.config.subset_id], + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + + data_dir = self.config.data_dir + url = _URLS[self.config.subset_id] + + train_data_dir = dl_manager.download_and_extract(Path(data_dir) / url) + + subset_filepaths = { + "bioasq_2014a": "allMeSH.json", + "bioasq_2014ba": "allMeSH_limitjournals.json", + "bioasq_2015a": "allMeSH.json", + "bioasq_2015ba": "allMeSH_limitjournals.json", + "bioasq_2016a": "allMeSH_2016.json", + "bioasq_2016ba": "allMeSH_limitjournals_2016.json", + "bioasq_2017a": "allMeSH_2017.json", + "bioasq_2018a": "allMeSH_2018.json", + "bioasq_2019a": "allMeSH_2019.json", + "bioasq_2020a": "allMeSH_2020.json", + "bioasq_2021a": "allMeSH_2021.json", + "bioasq_2022a": "allMeSH_2022.json", + } + filepath = Path(train_data_dir) / subset_filepaths[self.config.subset_id] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + "split": "train", + }, + ), + ] + + def _generate_articles(self, filepath): + with open(filepath, "r", encoding="utf-8", errors="ignore") as f: + if self.config.subset_id in ("bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): + article_index = 0 + + for line in f: + try: + record = json.loads(line.rstrip(",\n")) + except json.decoder.JSONDecodeError: + # TODO: FIXME: Nicer handling of these lines + if "'articles'" in line: + continue + else: + print("FAILED:", line) + continue + else: + yield article_index, record + article_index += 1 + else: + for article_index, record in enumerate(ijson.items(f, "articles.item")): + yield article_index, record + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + for record_index, record in self._generate_articles(filepath=filepath): + if self.config.schema == "source": + yield record_index, record + elif self.config.schema == "bigbio_text": + yield record_index, { + "id": record["pmid"], + "document_id": record["title"], + "text": 
record["abstractText"], + "labels": record["meshMajor"], + } From 7df3809be92ea51bf18f9d78e38efedccdfb2314 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Fri, 22 Apr 2022 07:53:20 +0200 Subject: [PATCH 2/4] add description; add year 2013 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 46 ++++++++++++---------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index e4ecdc13..7656f644 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -12,24 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -# TODO: FIXME: Add a description """ -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. - -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +BioASQ Task A On Biomedical Text Classification is based on the standard process +followed by PubMed to index journal abstracts. 
This task uses PubMed documents, +written in English, along with annotated MeSH terms (by human curators) +that are to be inferred from the documents. + +Note that the main difference between datasets from different years, apart from the size, +is the MeSH terms used. For example the 2015 training datasets contain articles +where MeSH 2015 have been assigned. Also, for 2014, 2015 and 2016 there are two +versions of the training data available. The small version (wrt size) consists of +articles that belong to the pool of journals that the BioASQ team used to select the +articles for the test data (this was a subset of the available journals). The bigger +version consists of articles from every available journal. Since 2017 articles for the +test data will be selected from all available journals, so only one corresponding training data +set will be available. The evaluation of the results during each year of the challenge +is performed using the corresponding version of the MeSH terms, thus their usage is highly +recommended. The training datasets of previous years of the challenge are also available +for reference reasons. Note that not every MeSH term is covered in the datasets. 
+ +For more information about the challenge, the organisers and the relevant +publications please visit: http://bioasq.org/ """ import ijson @@ -66,6 +69,7 @@ _DATASETNAME = "bioasq_task_a" # TODO: Find description and copy it +_BIOASQ_2013A_DESCRIPTION = "" _BIOASQ_2014A_DESCRIPTION = "" _BIOASQ_2014bA_DESCRIPTION = "" @@ -83,6 +87,7 @@ _BIOASQ_2022A_DESCRIPTION = "" _DESCRIPTION = { + "bioasq_2013a": _BIOASQ_2013A_DESCRIPTION, "bioasq_2014a": _BIOASQ_2014A_DESCRIPTION, "bioasq_2014ba": _BIOASQ_2014bA_DESCRIPTION, "bioasq_2015a": _BIOASQ_2015A_DESCRIPTION, @@ -103,8 +108,8 @@ # See http://participants-area.bioasq.org/accounts/register/ _LICENSE = "https://www.nlm.nih.gov/databases/download/terms_and_conditions.html" -# TODO: FIXME: Add bioasq 2013 _URLS = { + "bioasq_2013a": "allMeSH.zip", "bioasq_2014a": "allMeSH.zip", "bioasq_2014ba": "allMeSH_limitjournals.zip", "bioasq_2015a": "allMeSH.zip", @@ -138,6 +143,7 @@ class BioasqTaskADataset(datasets.GeneratorBasedBuilder): # BioASQ A 2014 through BioASQ A 2022 BUILDER_CONFIGS = [] for year in (( + "2013", "2014", "2014b", "2015", @@ -204,6 +210,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: train_data_dir = dl_manager.download_and_extract(Path(data_dir) / url) subset_filepaths = { + "bioasq_2013a": "allMeSH.json", "bioasq_2014a": "allMeSH.json", "bioasq_2014ba": "allMeSH_limitjournals.json", "bioasq_2015a": "allMeSH.json", @@ -231,14 +238,13 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: def _generate_articles(self, filepath): with open(filepath, "r", encoding="utf-8", errors="ignore") as f: - if self.config.subset_id in ("bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): + if self.config.subset_id in ("bioasq_2013a", "bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): article_index = 0 for line in f: try: record = json.loads(line.rstrip(",\n")) except json.decoder.JSONDecodeError: - # TODO: FIXME: Nicer handling of these 
lines if "'articles'" in line: continue else: From 4351cb8244ebdeb81564d0455b3224d58d0f895d Mon Sep 17 00:00:00 2001 From: MFreidank Date: Mon, 25 Apr 2022 15:20:03 +0200 Subject: [PATCH 3/4] Adding dataloader for Bioasq Task A 2013-2022 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 58 +++++++++++++++------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index 7656f644..ff668087 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -68,23 +68,47 @@ _DATASETNAME = "bioasq_task_a" -# TODO: Find description and copy it -_BIOASQ_2013A_DESCRIPTION = "" -_BIOASQ_2014A_DESCRIPTION = "" -_BIOASQ_2014bA_DESCRIPTION = "" - -_BIOASQ_2015A_DESCRIPTION = "" -_BIOASQ_2015bA_DESCRIPTION = "" - -_BIOASQ_2016A_DESCRIPTION = "" -_BIOASQ_2016bA_DESCRIPTION = "" - -_BIOASQ_2017A_DESCRIPTION = "" -_BIOASQ_2018A_DESCRIPTION = "" -_BIOASQ_2019A_DESCRIPTION = "" -_BIOASQ_2020A_DESCRIPTION = "" -_BIOASQ_2021A_DESCRIPTION = "" -_BIOASQ_2022A_DESCRIPTION = "" +_DESCRIPTION_TEMPLATE = """\ +The data are intended to be used as training data for BioASQ 10 A, which will take place during {year}. +There is one file containing the data: + - {filename} + +The training data sets for this task are available for downloading. They +contain annotated articles from PubMed, where annotated means that MeSH terms +have been assigned to the articles by the human curators in PubMed. Table 1 +provides information about the provided datasets. Note that the main difference +between those datasets among the different years, apart from the size, is the +MeSH terms used. For example the 2015 training datasets contain articles where +MeSH 2015 have been assigned. Also, for 2014, 2015 and 2016 there are two +versions (a and b) of the training data available. 
The small version (wrt size) consists +of articles that belong to the pool of journals that the BioASQ team used to +select the articles for the test data (this was a subset of the available journals). +The bigger version consists of articles from every available +journal. Since 2017 articles for the test data will be selected from all +available journals, so only one corresponding training data set will be +available. The evaluation of the results during each year of the challenge is +performed using the corresponding version of the MeSH terms, thus their usage +is highly recommended. The training datasets of previous years of the challenge +are also available for reference reasons. Note that not every MeSH term is +covered in the datasets. +""".format +_BIOASQ_2013A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2013, filename="allMeSH.zip") + +_BIOASQ_2014A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2014, filename="allMeSH.zip") +_BIOASQ_2014bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2014, filename="allMeSH_limitjournals.zip") + +_BIOASQ_2015A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2015, filename="allMeSH.zip") +_BIOASQ_2015bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2015, filename="allMeSH_limitjournals.zip") + +_BIOASQ_2016A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2016, filename="allMeSH_2016.zip") +_BIOASQ_2016bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2016, filename="allMeSH_limitjournals_2016.zip") + +_BIOASQ_2017A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2017, filename="allMeSH_2017.json") +_BIOASQ_2018A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2018, filename="allMeSH_2018.json") +_BIOASQ_2019A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2019, filename="allMeSH_2019.json") +_BIOASQ_2020A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2020, filename="allMeSH_2020.json") +_BIOASQ_2021A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2021, filename="allMeSH_2021.json") +_BIOASQ_2022A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2022, filename="allMeSH_2022.json") _DESCRIPTION = { 
"bioasq_2013a": _BIOASQ_2013A_DESCRIPTION, From 679e9d5e330f23bdedff1597ce01809187ee13c3 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Mon, 25 Apr 2022 15:31:25 +0200 Subject: [PATCH 4/4] minor fix, documentation --- biodatasets/bioasq_task_a/bioasq_task_a.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index ff668087..80b2e333 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -269,11 +269,11 @@ def _generate_articles(self, filepath): try: record = json.loads(line.rstrip(",\n")) except json.decoder.JSONDecodeError: - if "'articles'" in line: - continue - else: - print("FAILED:", line) - continue + # NOTE: First and last line of 2013, 2014 do not contain valid JSON, + # but also not any relevant data (first line has a single quote + # and the term 'articles=[' and the last line contains + # closing brackets). We skip these irrelevant lines. + continue else: yield article_index, record article_index += 1