Add docs for adding tasks to data downloader (#1221)

* add docs for adding task to downloader
nyu-mll · Nov 5, 2020 · 9892766 · 9892766
1 parent e4f1c4b
commit 9892766
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 19 deletions.
diff --git a/guides/tasks/adding_tasks.md b/guides/tasks/adding_tasks.md
@@ -162,9 +162,46 @@ def get_evaluation_scheme_for_task(task) -> BaseEvaluationScheme:
         return SimpleAccuracyEvaluationScheme()
 ```
 
-## 4. Update the supported tasks documentation
+## 4. Add a data downloader
+If the new task is publicly available, add the new task to the data downloader in [`jiant/scripts/download_data/runscript.py`](../../jiant/scripts/download_data/runscript.py). There are two flavors of supported task downloaders:
+
+1) If the new task is supported by Hugging Face's [Datasets](https://huggingface.co/nlp/viewer/):
+    - Add the task to `OTHER_HF_DATASETS_TASKS` in [`jiant/scripts/download_data/constants.py`](../../jiant/scripts/download_data/constants.py). Also add the task to `HF_DATASETS_CONVERSION_DICT` in [`jiant/scripts/download_data/dl_datasets/hf_datasets_tasks.py`](../../jiant/scripts/download_data/dl_datasets/hf_datasets_tasks.py). `HF_DATASETS_CONVERSION_DICT` is used to map field changes in the Datasets version back to the original dataset.
+
+2) If the task is not supported by Hugging Face's Datasets, and the dataset is publicly available for download (i.e. downloadable directly via wget, and not behind an authentication wall such as Google Drive):
+    - Add a function to directly download the task here: [`jiant/scripts/download_data/dl_datasets/files_tasks.py`](../../jiant/scripts/download_data/dl_datasets/files_tasks.py). The function signature should be similar to:
+
+```python
+def download_senteval_data_and_write_config(
+    task_name: str, task_data_path: str, task_config_path: str
+)
+```
+
+The direct download function for your task should output a config object to `task_config_path` based on the data required to complete the task. For Senteval, the task config object is:
+
+```python
+py_io.write_json(
+    data={
+        "task": "senteval",
+        "paths": {
+            "train": os.path.join(task_data_path, "train.jsonl"),
+            "val": os.path.join(task_data_path, "valid.jsonl"),
+            "test": os.path.join(task_data_path, "tests.jsonl"),
+        },
+        "name": "senteval",
+    },
+    path=task_config_path,
+)
+```
+
+## 5. Update the supported tasks documentation
 Add your task to [`guides/tasks/supported_tasks.md`](../../guides/tasks/supported_tasks.md).
 
+
+## 6. (Recommended) Tag the creator in the pull request
+Please @ the creator of the task in your PR to let them know it is part of `jiant`! This also gives them a chance to review the task.
+
+
 ## Congratulations!
 And that’s it. You’ve made all the core code changes required to include the `SentevalTenseTask` in your `jiant` experiments.
 
@@ -182,4 +219,4 @@ What's next? To tokenize and cache your `SentevalTenseTask` (which you shortname
 }
 ```
 
-To learn more about running experiments with you new task, check out the examples [available here](../README.md).
+To learn more about running experiments with your new task, check out the examples [available here](../README.md).
diff --git a/guides/tasks/supported_tasks.md b/guides/tasks/supported_tasks.md
@@ -70,6 +70,7 @@
 | TyDiQA | `tydiqa_{lang}` | ✅ | ✅ | tydiqa | XTREME, multi-lang |
 | UDPOS | `udpos_{lang}` | ✅ | ✅ | udpos | XTREME, multi-lang |
 | WiC | wic | ✅ | ✅ | wic | SuperGLUE |
+| Winogrande | winogrande | ✅ | ✅ | winogrande | |
 | WNLI | wnli | ✅ | ✅ | wnli | GLUE |
 | WSC | wsc | ✅ | ✅ | wsc | SuperGLUE |
 | XNLI | `xnli_{lang}` | ✅ | ✅ | xnli | XTREME, multi-lang |

diff --git a/jiant/scripts/download_data/constants.py b/jiant/scripts/download_data/constants.py
@@ -22,3 +22,17 @@
 DIRECT_DOWNLOAD_TASKS = set(
     list(SQUAD_TASKS) + list(DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS) + list(OTHER_DOWNLOAD_TASKS)
 )
+OTHER_HF_DATASETS_TASKS = {
+    "snli",
+    "commonsenseqa",
+    "hellaswag",
+    "cosmosqa",
+    "socialiqa",
+    "scitail",
+    "quoref",
+    "adversarial_nli_r1",
+    "adversarial_nli_r2",
+    "adversarial_nli_r3",
+    "arc_easy",
+    "arc_challenge",
+}
diff --git a/jiant/scripts/download_data/runscript.py b/jiant/scripts/download_data/runscript.py
@@ -8,11 +8,14 @@
 from jiant.tasks.constants import (
     GLUE_TASKS,
     SUPERGLUE_TASKS,
-    OTHER_HF_DATASETS_TASKS,
     XTREME_TASKS,
     BENCHMARKS,
 )
-from jiant.scripts.download_data.constants import SQUAD_TASKS, DIRECT_DOWNLOAD_TASKS
+from jiant.scripts.download_data.constants import (
+    SQUAD_TASKS,
+    DIRECT_DOWNLOAD_TASKS,
+    OTHER_HF_DATASETS_TASKS,
+)
 
 # DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the HF Datasets
 # implementation differs from the original dataset format

diff --git a/jiant/tasks/constants.py b/jiant/tasks/constants.py
@@ -25,21 +25,6 @@
     "superglue_winogender_diagnostics",
 }
 
-OTHER_HF_DATASETS_TASKS = {
-    "snli",
-    "commonsenseqa",
-    "hellaswag",
-    "cosmosqa",
-    "socialiqa",
-    "scitail",
-    "quoref",
-    "adversarial_nli_r1",
-    "adversarial_nli_r2",
-    "adversarial_nli_r3",
-    "arc_easy",
-    "arc_challenge",
-}
-
 XTREME_TASKS = {
     "xnli",
     "pawsx",