From 33320d98de9e8b5c695d1c62ea6753a7927c8637 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Wed, 20 Apr 2022 19:22:39 +0200 Subject: [PATCH 1/4] chg: added data loaders for 2014 - 2022 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 265 +++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 biodatasets/bioasq_task_a/bioasq_task_a.py diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py new file mode 100644 index 00000000..e4ecdc13 --- /dev/null +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -0,0 +1,265 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: FIXME: Add a description +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. 
+ * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import ijson +import json +from pathlib import Path +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@article{tsatsaronis2015overview, + title = { + An overview of the BIOASQ large-scale biomedical semantic indexing and + question answering competition + }, + author = { + Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos + and Partalas, Ioannis and Zschunke, Matthias and Alvers, Michael R and + Weissenborn, Dirk and Krithara, Anastasia and Petridis, Sergios and + Polychronopoulos, Dimitris and others + }, + year = 2015, + journal = {BMC bioinformatics}, + publisher = {BioMed Central Ltd}, + volume = 16, + number = 1, + pages = 138 +} +""" + +_DATASETNAME = "bioasq_task_a" + +# TODO: Find description and copy it +_BIOASQ_2014A_DESCRIPTION = "" +_BIOASQ_2014bA_DESCRIPTION = "" + +_BIOASQ_2015A_DESCRIPTION = "" +_BIOASQ_2015bA_DESCRIPTION = "" + +_BIOASQ_2016A_DESCRIPTION = "" +_BIOASQ_2016bA_DESCRIPTION = "" + +_BIOASQ_2017A_DESCRIPTION = "" +_BIOASQ_2018A_DESCRIPTION = "" +_BIOASQ_2019A_DESCRIPTION = "" +_BIOASQ_2020A_DESCRIPTION = "" +_BIOASQ_2021A_DESCRIPTION = "" +_BIOASQ_2022A_DESCRIPTION = "" + +_DESCRIPTION = { + "bioasq_2014a": _BIOASQ_2014A_DESCRIPTION, + "bioasq_2014ba": _BIOASQ_2014bA_DESCRIPTION, + "bioasq_2015a": _BIOASQ_2015A_DESCRIPTION, + "bioasq_2015ba": _BIOASQ_2015bA_DESCRIPTION, + "bioasq_2016a": _BIOASQ_2016A_DESCRIPTION, + "bioasq_2016ba": _BIOASQ_2016bA_DESCRIPTION, + "bioasq_2017a": _BIOASQ_2017A_DESCRIPTION, + "bioasq_2018a": _BIOASQ_2018A_DESCRIPTION, + "bioasq_2019a": _BIOASQ_2019A_DESCRIPTION, + 
"bioasq_2020a": _BIOASQ_2020A_DESCRIPTION, + "bioasq_2021a": _BIOASQ_2021A_DESCRIPTION, + "bioasq_2022a": _BIOASQ_2022A_DESCRIPTION, +} + +_HOMEPAGE = "http://participants-area.bioasq.org/datasets/" + +# Data access requires prior registration with BioASQ. +# See http://participants-area.bioasq.org/accounts/register/ +_LICENSE = "https://www.nlm.nih.gov/databases/download/terms_and_conditions.html" + +# TODO: FIXME: Add bioasq 2013 +_URLS = { + "bioasq_2014a": "allMeSH.zip", + "bioasq_2014ba": "allMeSH_limitjournals.zip", + "bioasq_2015a": "allMeSH.zip", + "bioasq_2015ba": "allMeSH_limitjournals.zip", + "bioasq_2016a": "allMeSH_2016.zip", + "bioasq_2016ba": "allMeSH_limitjournals_2016.zip", + "bioasq_2017a": "allMeSH_2017.zip", + "bioasq_2018a": "allMeSH_2018.zip", + "bioasq_2019a": "allMeSH_2019.zip", + "bioasq_2020a": "allMeSH_2020.zip", + "bioasq_2021a": "allMeSH_2021.zip", + "bioasq_2022a": "allMeSH_2022.zip", +} + +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class BioasqTaskADataset(datasets.GeneratorBasedBuilder): + """ + BioASQ Task A On Biomedical Text Classification. + Creates configs for BioASQ A 2013 through BioASQ A 2021. 
+ """ + + DEFAULT_CONFIG_NAME = "bioasq_2014a_source" + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # BioASQ A 2014 through BioASQ A 2022 + BUILDER_CONFIGS = [] + for year in (( + "2014", + "2014b", + "2015", + "2015b", + "2016", + "2016b", + "2017", + "2018", + "2019", + "2020", + "2021", + "2022", + )): + BUILDER_CONFIGS.extend([ + BigBioConfig( + name=f"bioasq_{year}a_source", + version=SOURCE_VERSION, + description=f"bioasq {year} Task A source schema", + schema="source", + subset_id=f"bioasq_{year}a", + ), + BigBioConfig( + name=f"bioasq_{year}a_bigbio_text", + version=BIGBIO_VERSION, + description=f"bioasq {year} Task A in simplified BigBio schema", + schema="bigbio_text", + subset_id=f"bioasq_{year}a", + ) + ]) + + def _info(self) -> datasets.DatasetInfo: + # BioASQ Task A source schema + if self.config.schema == "source": + features = datasets.Features( + { + "abstractText": datasets.Value("string"), + "journal": datasets.Value("string"), + "meshMajor": [datasets.Value("string")], + "pmid": datasets.Value("string"), + "title": datasets.Value("string"), + "year": datasets.Value("string"), + } + ) + # simplified schema for text classification tasks + elif self.config.schema == "bigbio_text": + features = schemas.text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION[self.config.subset_id], + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + + data_dir = self.config.data_dir + url = _URLS[self.config.subset_id] + + train_data_dir = dl_manager.download_and_extract(Path(data_dir) / url) + + subset_filepaths = { + "bioasq_2014a": "allMeSH.json", + "bioasq_2014ba": "allMeSH_limitjournals.json", + "bioasq_2015a": "allMeSH.json", + "bioasq_2015ba": "allMeSH_limitjournals.json", + "bioasq_2016a": "allMeSH_2016.json", + "bioasq_2016ba": "allMeSH_limitjournals_2016.json", + "bioasq_2017a": "allMeSH_2017.json", + "bioasq_2018a": "allMeSH_2018.json", + "bioasq_2019a": "allMeSH_2019.json", + "bioasq_2020a": "allMeSH_2020.json", + "bioasq_2021a": "allMeSH_2021.json", + "bioasq_2022a": "allMeSH_2022.json", + } + filepath = Path(train_data_dir) / subset_filepaths[self.config.subset_id] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + "split": "train", + }, + ), + ] + + def _generate_articles(self, filepath): + with open(filepath, "r", encoding="utf-8", errors="ignore") as f: + if self.config.subset_id in ("bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): + article_index = 0 + + for line in f: + try: + record = json.loads(line.rstrip(",\n")) + except json.decoder.JSONDecodeError: + # TODO: FIXME: Nicer handling of these lines + if "'articles'" in line: + continue + else: + print("FAILED:", line) + continue + else: + yield article_index, record + article_index += 1 + else: + for article_index, record in enumerate(ijson.items(f, "articles.item")): + yield article_index, record + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + for record_index, record in self._generate_articles(filepath=filepath): + if self.config.schema == "source": + yield record_index, record + elif self.config.schema == "bigbio_text": + yield record_index, { + "id": record["pmid"], + "document_id": record["title"], + "text": 
record["abstractText"], + "labels": record["meshMajor"], + } From 7df3809be92ea51bf18f9d78e38efedccdfb2314 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Fri, 22 Apr 2022 07:53:20 +0200 Subject: [PATCH 2/4] add description; add year 2013 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 46 ++++++++++++---------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index e4ecdc13..7656f644 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -12,24 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -# TODO: FIXME: Add a description """ -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. - -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +BioASQ Task A On Biomedical Text Classification is based on the standard process +followed by PubMed to index journal abstracts. 
This task uses PubMed documents, +written in English, along with annotated MeSH terms (by human curators) +that are to be inferred from the documents. + +Note that the main difference between datasets from different years, apart from the size, +is the MeSH terms used. For example the 2015 training datasets contain articles +where MeSH 2015 have been assigned. Also, for 2014, 2015 and 2016 there are two +versions of the training data available. The small version (wrt size) consists of +articles that belong to the pool of journals that the BioASQ team used to select the +articles for the test data (this was a subset of the available journals). The bigger +version consists of articles from every available journal. Since 2017 articles for the +test data will be selected from all available journals, so only one corresponding training data +set will be available. The evaluation of the results during each year of the challenge +is performed using the corresponding version of the MeSH terms, thus their usage is highly +recommended. The training datasets of previous years of the challenge are also available +for reference reasons. Note that not every MeSH term is covered in the datasets. 
+ +For more information about the challenge, the organisers and the relevant +publications please visit: http://bioasq.org/ """ import ijson @@ -66,6 +69,7 @@ _DATASETNAME = "bioasq_task_a" # TODO: Find description and copy it +_BIOASQ_2013A_DESCRIPTION = "" _BIOASQ_2014A_DESCRIPTION = "" _BIOASQ_2014bA_DESCRIPTION = "" @@ -83,6 +87,7 @@ _BIOASQ_2022A_DESCRIPTION = "" _DESCRIPTION = { + "bioasq_2013a": _BIOASQ_2013A_DESCRIPTION, "bioasq_2014a": _BIOASQ_2014A_DESCRIPTION, "bioasq_2014ba": _BIOASQ_2014bA_DESCRIPTION, "bioasq_2015a": _BIOASQ_2015A_DESCRIPTION, @@ -103,8 +108,8 @@ # See http://participants-area.bioasq.org/accounts/register/ _LICENSE = "https://www.nlm.nih.gov/databases/download/terms_and_conditions.html" -# TODO: FIXME: Add bioasq 2013 _URLS = { + "bioasq_2013a": "allMeSH.zip", "bioasq_2014a": "allMeSH.zip", "bioasq_2014ba": "allMeSH_limitjournals.zip", "bioasq_2015a": "allMeSH.zip", @@ -138,6 +143,7 @@ class BioasqTaskADataset(datasets.GeneratorBasedBuilder): # BioASQ A 2014 through BioASQ A 2022 BUILDER_CONFIGS = [] for year in (( + "2013", "2014", "2014b", "2015", @@ -204,6 +210,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: train_data_dir = dl_manager.download_and_extract(Path(data_dir) / url) subset_filepaths = { + "bioasq_2013a": "allMeSH.json", "bioasq_2014a": "allMeSH.json", "bioasq_2014ba": "allMeSH_limitjournals.json", "bioasq_2015a": "allMeSH.json", @@ -231,14 +238,13 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: def _generate_articles(self, filepath): with open(filepath, "r", encoding="utf-8", errors="ignore") as f: - if self.config.subset_id in ("bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): + if self.config.subset_id in ("bioasq_2013a", "bioasq_2014a", "bioasq_2014ba", "bioasq_2015a", "bioasq_2015ba"): article_index = 0 for line in f: try: record = json.loads(line.rstrip(",\n")) except json.decoder.JSONDecodeError: - # TODO: FIXME: Nicer handling of these 
lines if "'articles'" in line: continue else: From 4351cb8244ebdeb81564d0455b3224d58d0f895d Mon Sep 17 00:00:00 2001 From: MFreidank Date: Mon, 25 Apr 2022 15:20:03 +0200 Subject: [PATCH 3/4] Adding dataloader for Bioasq Task A 2013-2022 --- biodatasets/bioasq_task_a/bioasq_task_a.py | 58 +++++++++++++++------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index 7656f644..ff668087 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -68,23 +68,47 @@ _DATASETNAME = "bioasq_task_a" -# TODO: Find description and copy it -_BIOASQ_2013A_DESCRIPTION = "" -_BIOASQ_2014A_DESCRIPTION = "" -_BIOASQ_2014bA_DESCRIPTION = "" - -_BIOASQ_2015A_DESCRIPTION = "" -_BIOASQ_2015bA_DESCRIPTION = "" - -_BIOASQ_2016A_DESCRIPTION = "" -_BIOASQ_2016bA_DESCRIPTION = "" - -_BIOASQ_2017A_DESCRIPTION = "" -_BIOASQ_2018A_DESCRIPTION = "" -_BIOASQ_2019A_DESCRIPTION = "" -_BIOASQ_2020A_DESCRIPTION = "" -_BIOASQ_2021A_DESCRIPTION = "" -_BIOASQ_2022A_DESCRIPTION = "" +_DESCRIPTION_TEMPLATE = """\ +The data are intended to be used as training data for BioASQ 10 A, which will take place during {year}. +There is one file containing the data: + - {filename} + +The training data sets for this task are available for downloading. They +contain annotated articles from PubMed, where annotated means that MeSH terms +have been assigned to the articles by the human curators in PubMed. Table 1 +provides information about the provided datasets. Note that the main difference +between those datasets among the different years, apart from the size, is the +MeSH terms used. For example the 2015 training datasets contain articles where +MeSH 2015 have been assigned. Also, for 2014, 2015 and 2016 there are two +versions (a and b) of the training data available. 
The small version (wrt size) consists +of articles that belong to the pool of journals that the BioASQ team used to +select the articles for the test data (this was a subset of the available journals). +The bigger version consists of articles from every available +journal. Since 2017 articles for the test data will be selected from all +available journals, so only one corresponding training data set will be +available. The evaluation of the results during each year of the challenge is +performed using the corresponding version of the MeSH terms, thus their usage +is highly recommended. The training datasets of previous years of the challenge +are also available for reference reasons. Note that not every MeSH term is +covered in the datasets. +""".format +_BIOASQ_2013A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2013, filename="allMeSH.zip") + +_BIOASQ_2014A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2014, filename="allMeSH.zip") +_BIOASQ_2014bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2014, filename="allMeSH_limitjournals.zip") + +_BIOASQ_2015A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2015, filename="allMeSH.zip") +_BIOASQ_2015bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2015, filename="allMeSH_limitjournals.zip") + +_BIOASQ_2016A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2016, filename="allMeSH_2016.zip") +_BIOASQ_2016bA_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2016, filename="allMeSH_limitjournals_2016.zip") + +_BIOASQ_2017A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2017, filename="allMeSH_2017.json") +_BIOASQ_2018A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2018, filename="allMeSH_2018.json") +_BIOASQ_2019A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2019, filename="allMeSH_2019.json") +_BIOASQ_2020A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2020, filename="allMeSH_2020.json") +_BIOASQ_2021A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2021, filename="allMeSH_2021.json") +_BIOASQ_2022A_DESCRIPTION = _DESCRIPTION_TEMPLATE(year=2022, filename="allMeSH_2022.json") _DESCRIPTION = { 
"bioasq_2013a": _BIOASQ_2013A_DESCRIPTION, From 679e9d5e330f23bdedff1597ce01809187ee13c3 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Mon, 25 Apr 2022 15:31:25 +0200 Subject: [PATCH 4/4] minor fix, documentation --- biodatasets/bioasq_task_a/bioasq_task_a.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/biodatasets/bioasq_task_a/bioasq_task_a.py b/biodatasets/bioasq_task_a/bioasq_task_a.py index ff668087..80b2e333 100644 --- a/biodatasets/bioasq_task_a/bioasq_task_a.py +++ b/biodatasets/bioasq_task_a/bioasq_task_a.py @@ -269,11 +269,11 @@ def _generate_articles(self, filepath): try: record = json.loads(line.rstrip(",\n")) except json.decoder.JSONDecodeError: - if "'articles'" in line: - continue - else: - print("FAILED:", line) - continue + # NOTE: First and last line of 2013, 2014 do not contain valid JSON, + # but also not any relevant data (first line has a single quote + # and the term 'articles=[' and the last line contains + # closing brackets). We skip these irrelevant lines. + continue else: yield article_index, record article_index += 1