Add global mmlu cards (#1561)
* Added Global-MMLU cards for 42 languages

- Implemented subject-specific card generation
- Maintained consistency with original MMLU format

* Clean up

Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
Co-authored-by: elronbandel <[email protected]>
eliyahabba and elronbandel authored Jan 28, 2025
1 parent fddf5e3 commit 49cd166
Showing 2,396 changed files with 174,936 additions and 2 deletions.
172 changes: 172 additions & 0 deletions prepare/cards/global_mmlu.py
@@ -0,0 +1,172 @@
from unitxt.card import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
Deduplicate,
FilterByCondition,
ListFieldValues,
MapInstanceValues,
Set,
)
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

languages = [
"am",
"ar",
"bn",
"cs",
"de",
"el",
"en",
"es",
"fa",
"fil",
"fr",
"ha",
"he",
"hi",
"id",
"ig",
"it",
"ja",
"ko",
"ky",
"lt",
"mg",
"ms",
"ne",
"nl",
"ny",
"pl",
"pt",
"ro",
"ru",
"si",
"sn",
"so",
"sr",
"sv",
"sw",
"te",
"tr",
"uk",
"vi",
"yo",
"zh",
]
subtasks = [
"abstract_algebra",
"anatomy",
"astronomy",
"business_ethics",
"clinical_knowledge",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_medicine",
"college_physics",
"computer_security",
"conceptual_physics",
"econometrics",
"electrical_engineering",
"elementary_mathematics",
"formal_logic",
"global_facts",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_european_history",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_mathematics",
"high_school_microeconomics",
"high_school_physics",
"high_school_psychology",
"high_school_statistics",
"high_school_us_history",
"high_school_world_history",
"human_aging",
"human_sexuality",
"international_law",
"jurisprudence",
"logical_fallacies",
"machine_learning",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"moral_disputes",
"moral_scenarios",
"nutrition",
"philosophy",
"prehistory",
"professional_accounting",
"professional_law",
"professional_medicine",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
"virology",
"world_religions",
]
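
# 42 languages x 57 subjects -> 2,394 generated cards, which accounts for the
# 2,394 catalog JSON files in this commit (2,396 changed files minus this
# script and pyproject.toml).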


is_first = True
for language in languages:
    for subject in subtasks:
        card = TaskCard(
            # Each card loads one language config of Global-MMLU and keeps
            # only the rows for the current subject.
            loader=LoadHF(path="CohereForAI/Global-MMLU", name=language),
            preprocess_steps=[
                FilterByCondition(values={"subject": subject}, condition="eq"),
                Deduplicate(by=["question", "subject", "answer"]),
                # Global-MMLU ships a "dev" split; rename it to "train" so it
                # can serve as the demonstration pool.
                RenameSplits({"dev": "train"}),
                # Map letter answers to the 0-based indices expected by the
                # multiple-choice task.
                MapInstanceValues(
                    mappers={
                        "answer": {
                            "A": 0,
                            "B": 1,
                            "C": 2,
                            "D": 3,
                        }
                    }
                ),
                # Gather the four option columns into a single "choices" list.
                ListFieldValues(
                    fields=["option_a", "option_b", "option_c", "option_d"],
                    to_field="choices",
                ),
                Set({"topic": subject.replace("_", " ")}),
            ],
            task="tasks.qa.multiple_choice.with_topic",
            templates="templates.qa.multiple_choice.with_topic.all",
            __tags__={
                "annotations_creators": "expert-generated",
                "language": language,
                "language_creators": "expert-generated",
                "license": "apache-2.0",
                "multilinguality": "multilingual",
                "size_categories": "10K<n<100K",
                "source_datasets": "original",
                "task_categories": "question-answering",
                "task_ids": "multiple-choice-qa",
                "region": "global",
            },
            __description__=(
                "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations "
                "of MMLU questions with professional translations and crowd-sourced post-edits. The dataset "
                "includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or "
                "Culturally Agnostic (CA). This initiative was led by Cohere For AI in collaboration with external "
                "contributors from industry and academia. The test spans subjects in humanities, social sciences, "
                "hard sciences, and other areas. See the full description on the dataset page: "
                "https://huggingface.co/datasets/CohereForAI/Global-MMLU"
            ),
        )

        # Testing all 2,394 cards would be prohibitively slow, so only the
        # first generated card is sanity-checked.
        if is_first:
            test_card(card, strict=False)
            is_first = False
        add_to_catalog(card, f"cards.global_mmlu.{language}.{subject}", overwrite=True)
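
For a quick check of one generated card, a minimal usage sketch (assuming a
standard unitxt installation; the specific template name below is an
assumption, any entry of templates.qa.multiple_choice.with_topic.all works):

from unitxt import load_dataset

# Pull a generated card from the catalog; names follow the
# f"cards.global_mmlu.{language}.{subject}" pattern used above.
dataset = load_dataset(
    card="cards.global_mmlu.en.anatomy",
    template="templates.qa.multiple_choice.with_topic.mmlu",  # assumed name
    loader_limit=100,  # small HF download for a fast sanity check
)
print(dataset["test"][0]["source"])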
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -104,7 +104,7 @@ tests = [
"bs4",
"tenacity==8.3.0",
"accelerate",
"spacy",
"spacy",
"func_timeout==4.3.5",
"Wikipedia-API",
"sqlglot",
@@ -246,7 +246,7 @@ extend-immutable-calls = ["fastapi.Depends", "fastapi.params.Depends", "fastapi.
"src".msg = "Use unitxt outside src/ and relative imports inside src/ and install unitxt from source with `pip install -e '.[dev]'`."

[tool.codespell]
-ignore-words-list = 'rouge,ot,ans,nd,cann,som,tha,vie,ment,criterias,atleast'
+ignore-words-list = 'rouge,ot,ans,nd,cann,som,tha,vie,ment,criterias,atleast,te'
check-filenames = true
check-hidden = false
regex = "(?<![a-z])[a-z'`]+|[A-Z][a-z'`]*|[a-z]+'[a-z]*|[a-z]+(?=[_-])|[a-z]+(?=[A-Z])|\\d+"
73 changes: 73 additions & 0 deletions src/unitxt/catalog/cards/global_mmlu/am/abstract_algebra.json
@@ -0,0 +1,73 @@
{
"__type__": "task_card",
"loader": {
"__type__": "load_hf",
"path": "CohereForAI/Global-MMLU",
"name": "am"
},
"preprocess_steps": [
{
"__type__": "filter_by_condition",
"values": {
"subject": "abstract_algebra"
},
"condition": "eq"
},
{
"__type__": "deduplicate",
"by": [
"question",
"subject",
"answer"
]
},
{
"__type__": "rename_splits",
"mapper": {
"dev": "train"
}
},
{
"__type__": "map_instance_values",
"mappers": {
"answer": {
"A": 0,
"B": 1,
"C": 2,
"D": 3
}
}
},
{
"__type__": "list_field_values",
"fields": [
"option_a",
"option_b",
"option_c",
"option_d"
],
"to_field": "choices"
},
{
"__type__": "set",
"fields": {
"topic": "abstract algebra"
}
}
],
"task": "tasks.qa.multiple_choice.with_topic",
"templates": "templates.qa.multiple_choice.with_topic.all",
"__tags__": {
"annotations_creators": "expert-generated",
"language": "am",
"language_creators": "expert-generated",
"license": "apache-2.0",
"multilinguality": "multilingual",
"size_categories": "10K<n<100K",
"source_datasets": "original",
"task_categories": "question-answering",
"task_ids": "multiple-choice-qa",
"region": "global"
},
"__description__": "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations for MMLU questions along with professional translations and crowd-sourced post-edits. The dataset includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or Culturally Agnostic (CA)️. This initiative was led by Cohere For AI in collaboration with external contributors from industry and academia. The test spans subjects in humanities, social sciences, hard sciences, and other areas. See the full description on the dataset page: https://huggingface.co/datasets/CohereForAI/Global-MMLU"
}
73 changes: 73 additions & 0 deletions src/unitxt/catalog/cards/global_mmlu/am/anatomy.json
@@ -0,0 +1,73 @@
{
"__type__": "task_card",
"loader": {
"__type__": "load_hf",
"path": "CohereForAI/Global-MMLU",
"name": "am"
},
"preprocess_steps": [
{
"__type__": "filter_by_condition",
"values": {
"subject": "anatomy"
},
"condition": "eq"
},
{
"__type__": "deduplicate",
"by": [
"question",
"subject",
"answer"
]
},
{
"__type__": "rename_splits",
"mapper": {
"dev": "train"
}
},
{
"__type__": "map_instance_values",
"mappers": {
"answer": {
"A": 0,
"B": 1,
"C": 2,
"D": 3
}
}
},
{
"__type__": "list_field_values",
"fields": [
"option_a",
"option_b",
"option_c",
"option_d"
],
"to_field": "choices"
},
{
"__type__": "set",
"fields": {
"topic": "anatomy"
}
}
],
"task": "tasks.qa.multiple_choice.with_topic",
"templates": "templates.qa.multiple_choice.with_topic.all",
"__tags__": {
"annotations_creators": "expert-generated",
"language": "am",
"language_creators": "expert-generated",
"license": "apache-2.0",
"multilinguality": "multilingual",
"size_categories": "10K<n<100K",
"source_datasets": "original",
"task_categories": "question-answering",
"task_ids": "multiple-choice-qa",
"region": "global"
},
"__description__": "Global-MMLU is a multilingual evaluation set spanning 42 languages, combining machine translations for MMLU questions along with professional translations and crowd-sourced post-edits. The dataset includes cultural sensitivity annotations, classifying questions as Culturally Sensitive (CS) or Culturally Agnostic (CA)️. This initiative was led by Cohere For AI in collaboration with external contributors from industry and academia. The test spans subjects in humanities, social sciences, hard sciences, and other areas. See the full description on the dataset page: https://huggingface.co/datasets/CohereForAI/Global-MMLU"
}
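
These JSON files are the serialized form that add_to_catalog writes; a card
can be fetched back by its catalog name (a minimal sketch, assuming unitxt's
fetch_artifact helper, which resolves catalog names to these files):

from unitxt.artifact import fetch_artifact

# "cards.global_mmlu.am.anatomy" resolves to
# src/unitxt/catalog/cards/global_mmlu/am/anatomy.json shown above.
card, _ = fetch_artifact("cards.global_mmlu.am.anatomy")
print(type(card).__name__)  # TaskCard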