From 0ac6fe40501d97da98cb10388e34794398ec674e Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 03:10:59 +0800 Subject: [PATCH 01/25] promptsource dependency added --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9c6d441..1c3c7a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -51,3 +51,4 @@ wrapt==1.14.1 xxhash==3.2.0 yapf==0.32.0 yarl==1.8.2 +promptsource==0.2.3 From 7ffc88b7e49409cfdd3904bd013dc5eb786600b5 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 03:12:10 +0800 Subject: [PATCH 02/25] Project nlp dataset to a instruction dataset using promptsource. --- data/project_from_psrc.py | 226 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 data/project_from_psrc.py diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py new file mode 100644 index 0000000..aec3751 --- /dev/null +++ b/data/project_from_psrc.py @@ -0,0 +1,226 @@ +import os +import json +import argparse +import datasets +from tqdm import tqdm +import concurrent.futures +from tqdm.contrib.concurrent import process_map +from promptsource.templates import DatasetTemplates + + + +def export_dataset( + dataset_output_dir, + dataset_name, + subset_name, + prompt_template, + prompt, + dataset, + add_source_metadata=False, + highlight_variables=False, +): + splits = list(dataset.keys()) + prompt_name = prompt.get_name() + for split in splits: + dataset_split = dataset[split] + json_data_path = os.path.join(dataset_output_dir, split) + os.makedirs(json_data_path, exist_ok=True) + json_data_path = os.path.join( + json_data_path, + (prompt_template + "." + prompt_name).replace("/", "_").replace(" ", "_") + + ".jsonl", + ) + with open(json_data_path, "w", encoding="utf-8") as file_ptr: + total_num_sample = len(dataset_split) + for _id, sample in tqdm( + enumerate(dataset_split), + total=total_num_sample, + desc="{}_{}_{}_{}_{}".format( + dataset_name, subset_name, split, prompt_template, prompt_name + ), + ): + projected_sample = prompt.apply(sample, highlight_variables=False) + answer_choice_list = prompt.get_answer_choices_list(sample) + if len(projected_sample) != 2: + continue + source, target = projected_sample + projected_sample_with_metadata = { + "id": _id, + "source": source, + "target": target, + "prompt_template": prompt_template, + "prompt_name": prompt_name, + "prompt_answer_choice_list": answer_choice_list, + "dataset_name": dataset_name, + "subset_name": subset_name, + "split": split, + "metrics": prompt.metadata.metrics, + "original_task": prompt.metadata.original_task, + "choices_in_prompt": prompt.metadata.choices_in_prompt, + "languages": prompt.metadata.languages, + } + if highlight_variables: + new_projected_sample = prompt.apply( + sample, highlight_variables=highlight_variables + ) + source, target = new_projected_sample + projected_sample_with_metadata["highlighted_source"] = source + projected_sample_with_metadata["highlighted_target"] = target + + if add_source_metadata: + for k, v in sample.items(): + k = "src_meta_{}".format(k) + assert k not in projected_sample_with_metadata + projected_sample_with_metadata[k] = v + + file_ptr.write(json.dumps(projected_sample_with_metadata)) + file_ptr.write("\n") + return "Completed:: {} !".format(json_data_path) + + +def invoke_none(lst): + for idx, val in enumerate(lst): + if val == "None" or val == "none" or val == "null" or val == "": + lst[idx] = None + return lst + + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-name-or-paths", + nargs="+", + default="glue", + help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). + A supported list can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates . + In the case of a new dataset, it is possible to apply a different prompt template to to a new dataset it as long as + the JSON structure of the dataset is the same as what is required in the original prompt template.""" + ) + parser.add_argument( + "--dataset-configs", + nargs="+", + default=None, + help="""A list of huggingface dataset-config. `--dataset-name-or-paths` along with `--dataset-configs` defines a data file. + If there is no `--dataset-configs` in huggingface, use None. The first argument in the `--dataset-name-or-paths` refers to the + first argument of the `--dataset-configs`. There should be an equal number of argument in `--dataset-name-or-paths` and `--dataset-configs`.""" + ) + parser.add_argument( + "--prompt-templates-configs", + nargs="+", + default=None, + help="""Name of the prompt template. Please use `None` if you want to project with all the prompt templates. + The first argument in the `--dataset-name-or-paths` & `--dataset-configs` refers to the + first argument of the `--prompt-templates-configs`. There should be an equal number of argument in + `--dataset-name-or-paths`, `--dataset-configs` and `--prompt-templates-configs`""" + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)", + ) + parser.add_argument( + "--output-dir", type=str, required=True, + help="Path to the output dir where the projected data will be stored." + ) + parser.add_argument( + "--num-proc", + type=int, + default=9, + help="Total number of parallel process." + ) + parser.add_argument( + "--add-source-metadata", + action="store_true", + help=""" + Add all the metadata from source dataset. This will create new keys names `src_meta_{original_keys}` + where this `original_keys` are all the keys from the original dataset key names (a.k.a column name). + These variable are kept with the completion so that we can recover the projection again if needed. + """, + ) + parser.add_argument( + "--highlight-variables", + action="store_true", + help="""Highlight token that are coming from the prompts and original dataset." + This feature can be use to differentiate prompt tokens and input tokens.""", + ) + args = parser.parse_args() + + assert len(args.dataset_name_or_paths) == len(args.dataset_configs) + assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs) + + invoke_none(args.dataset_name_or_paths) + invoke_none(args.dataset_configs) + invoke_none(args.prompt_templates_configs) + + prompted_sample_gen_io_tuple_list = [] + # loading and caching each of the dataset & creating multiprocessor i/o for doing projection. + for (dataset_name_or_path, dataset_config, prompt_template_config) in zip( + args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs + ): + dataset = datasets.load_dataset(dataset_name_or_path, dataset_config, cache_dir=args.cache_dir) + psrc_prompt_template_signature = prompt_template_config + if psrc_prompt_template_signature is None: + if dataset_config is None: + psrc_prompt_template_signature = "{}".format(dataset_name_or_path) + else: + psrc_prompt_template_signature = "{}/{}".format(dataset_name_or_path, dataset_config) + dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) + os.makedirs(dataset_output_dir, exist_ok=True) + if dataset_config is not None: + dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) + os.makedirs(dataset_output_dir, exist_ok=True) + prompt_templates = DatasetTemplates(psrc_prompt_template_signature) + prompt_names = list(prompt_templates.name_to_id_mapping.keys()) + for prompt_name in prompt_names: + prompt_template = prompt_templates[prompt_name] + prompted_sample_gen_io_tuple = (dataset_output_dir, + dataset_name_or_path, + dataset_config, + psrc_prompt_template_signature, + prompt_template, + dataset, + args.add_source_metadata, + args.highlight_variables) + prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) + + # Test a single process run + # export_dataset( + # prompted_sample_gen_io_tuple_list[0][0], + # prompted_sample_gen_io_tuple_list[0][1], + # prompted_sample_gen_io_tuple_list[0][2], + # prompted_sample_gen_io_tuple_list[0][3], + # prompted_sample_gen_io_tuple_list[0][4], + # prompted_sample_gen_io_tuple_list[0][5], + # prompted_sample_gen_io_tuple_list[0][6], + # prompted_sample_gen_io_tuple_list[0][7], + # ) + + # Projecting data using multiprocessing. It's recommended to use large number of CPU machine. set up `--num-proc` accrodingly. + num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) + + with concurrent.futures.ProcessPoolExecutor( + max_workers=num_proc + ) as executor: + for _out in tqdm( + executor.map( + export_dataset, + [prompted_sample_gen_io[0] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_output_dir + [prompted_sample_gen_io[1] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_name_or_path + [prompted_sample_gen_io[2] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_config + [prompted_sample_gen_io[3] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # psrc_prompt_template_signature + [prompted_sample_gen_io[4] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # prompt_template + [prompted_sample_gen_io[5] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset + [prompted_sample_gen_io[6] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.add_source_metadata + [prompted_sample_gen_io[7] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.highlight_variables + ), + total=len(args.dataset_name_or_paths), + ): + try: + print(_out) + except Exception as emsg: + print("Exception msg: {}".format(emsg)) + +if __name__ == "__main__": + main() \ No newline at end of file From fb3fbe7bda9052030640c90a12741dc6f179a232 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 03:13:15 +0800 Subject: [PATCH 03/25] sample bash script for running data/project_from_psrc.py --- scripts/project_from_psrc.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 scripts/project_from_psrc.sh diff --git a/scripts/project_from_psrc.sh b/scripts/project_from_psrc.sh new file mode 100644 index 0000000..352e9ee --- /dev/null +++ b/scripts/project_from_psrc.sh @@ -0,0 +1,14 @@ +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache + +python data/project_from_psrc.py \ +--dataset-name-or-paths glue glue glue glue glue \ +--dataset-configs cola sst2 mrpc qqp stsb \ +--prompt-templates-configs None None None None None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--highlight-variables \ +--add-source-metadata \ +--num-proc 16 \ No newline at end of file From 73c2610a62661e2cf66fea9bf36fcc82b9c3308a Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 03:15:58 +0800 Subject: [PATCH 04/25] update readme --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index 91925ae..d538d10 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,33 @@ conda activate instructmultilingual pip install -r requirements.txt ``` +## Dataset Projection + +### promptsource + +```shell +DUMP_FOLDER='' # fill this with your desired address +SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache + +python data/project_from_psrc.py \ +--dataset-name-or-paths glue glue glue glue glue \ +--dataset-configs cola sst2 mrpc qqp stsb \ +--prompt-templates-configs None None None None None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--highlight-variables \ +--add-source-metadata \ +--num-proc 16 +``` + +See the details of the arguments by, + +```shell +python data/project_from_psrc.py --help +``` + ## Translate ```shell From f14a3d19fe7bca19a37970a0436383efe4047f15 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 04:06:32 +0800 Subject: [PATCH 05/25] sync naming of argument and module var --- data/project_from_psrc.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index aec3751..31a98f8 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -12,8 +12,8 @@ def export_dataset( dataset_output_dir, dataset_name, - subset_name, - prompt_template, + dataset_config, + psrc_prompt_template_signature, prompt, dataset, add_source_metadata=False, @@ -27,7 +27,7 @@ def export_dataset( os.makedirs(json_data_path, exist_ok=True) json_data_path = os.path.join( json_data_path, - (prompt_template + "." + prompt_name).replace("/", "_").replace(" ", "_") + (psrc_prompt_template_signature + "." + prompt_name).replace("/", "_").replace(" ", "_") + ".jsonl", ) with open(json_data_path, "w", encoding="utf-8") as file_ptr: @@ -36,7 +36,7 @@ def export_dataset( enumerate(dataset_split), total=total_num_sample, desc="{}_{}_{}_{}_{}".format( - dataset_name, subset_name, split, prompt_template, prompt_name + dataset_name, dataset_config, split, psrc_prompt_template_signature, prompt_name ), ): projected_sample = prompt.apply(sample, highlight_variables=False) @@ -48,11 +48,11 @@ def export_dataset( "id": _id, "source": source, "target": target, - "prompt_template": prompt_template, + "psrc_prompt_template_signature": psrc_prompt_template_signature, "prompt_name": prompt_name, "prompt_answer_choice_list": answer_choice_list, "dataset_name": dataset_name, - "subset_name": subset_name, + "dataset_config": dataset_config, "split": split, "metrics": prompt.metadata.metrics, "original_task": prompt.metadata.original_task, @@ -143,7 +143,7 @@ def main(): "--highlight-variables", action="store_true", help="""Highlight token that are coming from the prompts and original dataset." - This feature can be use to differentiate prompt tokens and input tokens.""", + This feature can be used to differentiate prompt tokens and input tokens.""", ) args = parser.parse_args() From 93f2bfef3922842982f79dcc5fe55c5f5e96a96c Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Thu, 9 Mar 2023 15:49:36 +0800 Subject: [PATCH 06/25] fix: typo & --help --- data/project_from_psrc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 31a98f8..60e050b 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -94,7 +94,8 @@ def main(): default="glue", help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). A supported list can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates . - In the case of a new dataset, it is possible to apply a different prompt template to to a new dataset it as long as + Usually prompt templates are written for a specific datasets. But in the case of a new dataset, + it is possible to apply a different (written for a different dataset) prompt template to a new dataset as long as the JSON structure of the dataset is the same as what is required in the original prompt template.""" ) parser.add_argument( From fe48bcefa0a6bbc26900d9565a44a0af06bc3970 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 06:02:23 +0800 Subject: [PATCH 07/25] code refactoring & cleaning --- data/project_from_psrc.py | 55 ++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 60e050b..d8d5478 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -1,5 +1,6 @@ import os import json +import logging import argparse import datasets from tqdm import tqdm @@ -7,20 +8,20 @@ from tqdm.contrib.concurrent import process_map from promptsource.templates import DatasetTemplates - +logger = logging.getLogger(__name__) def export_dataset( dataset_output_dir, dataset_name, dataset_config, psrc_prompt_template_signature, - prompt, + prompt_template, dataset, add_source_metadata=False, highlight_variables=False, ): splits = list(dataset.keys()) - prompt_name = prompt.get_name() + prompt_name = prompt_template.get_name() for split in splits: dataset_split = dataset[split] json_data_path = os.path.join(dataset_output_dir, split) @@ -39,8 +40,8 @@ def export_dataset( dataset_name, dataset_config, split, psrc_prompt_template_signature, prompt_name ), ): - projected_sample = prompt.apply(sample, highlight_variables=False) - answer_choice_list = prompt.get_answer_choices_list(sample) + projected_sample = prompt_template.apply(sample, highlight_variables=False) + answer_choice_list = prompt_template.get_answer_choices_list(sample) if len(projected_sample) != 2: continue source, target = projected_sample @@ -54,13 +55,13 @@ def export_dataset( "dataset_name": dataset_name, "dataset_config": dataset_config, "split": split, - "metrics": prompt.metadata.metrics, - "original_task": prompt.metadata.original_task, - "choices_in_prompt": prompt.metadata.choices_in_prompt, - "languages": prompt.metadata.languages, + "metrics": prompt_template.metadata.metrics, + "original_task": prompt_template.metadata.original_task, + "choices_in_prompt": prompt_template.metadata.choices_in_prompt, + "languages": prompt_template.metadata.languages, } if highlight_variables: - new_projected_sample = prompt.apply( + new_projected_sample = prompt_template.apply( sample, highlight_variables=highlight_variables ) source, target = new_projected_sample @@ -186,19 +187,9 @@ def main(): args.highlight_variables) prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) - # Test a single process run - # export_dataset( - # prompted_sample_gen_io_tuple_list[0][0], - # prompted_sample_gen_io_tuple_list[0][1], - # prompted_sample_gen_io_tuple_list[0][2], - # prompted_sample_gen_io_tuple_list[0][3], - # prompted_sample_gen_io_tuple_list[0][4], - # prompted_sample_gen_io_tuple_list[0][5], - # prompted_sample_gen_io_tuple_list[0][6], - # prompted_sample_gen_io_tuple_list[0][7], - # ) - - # Projecting data using multiprocessing. It's recommended to use large number of CPU machine. set up `--num-proc` accrodingly. + # Projecting data using multiprocessing. + # It's recommended to use large number of CPU machine if you are projecting multiple dataset. + # set up `--num-proc` accrodingly. num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) with concurrent.futures.ProcessPoolExecutor( @@ -208,20 +199,20 @@ def main(): executor.map( export_dataset, [prompted_sample_gen_io[0] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_output_dir - [prompted_sample_gen_io[1] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_name_or_path - [prompted_sample_gen_io[2] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_config - [prompted_sample_gen_io[3] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # psrc_prompt_template_signature - [prompted_sample_gen_io[4] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # prompt_template - [prompted_sample_gen_io[5] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset - [prompted_sample_gen_io[6] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.add_source_metadata - [prompted_sample_gen_io[7] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.highlight_variables + [prompted_sample_gen_io[1] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_name_or_path + [prompted_sample_gen_io[2] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_config + [prompted_sample_gen_io[3] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # psrc_prompt_template_signature + [prompted_sample_gen_io[4] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # prompt_template + [prompted_sample_gen_io[5] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset + [prompted_sample_gen_io[6] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.add_source_metadata + [prompted_sample_gen_io[7] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.highlight_variables ), total=len(args.dataset_name_or_paths), ): try: - print(_out) + logger.info(_out) except Exception as emsg: - print("Exception msg: {}".format(emsg)) + logger.warning("Exception msg: {}".format(emsg)) if __name__ == "__main__": main() \ No newline at end of file From 95e9b90da551b30570be49d83883e11ee4eac7e3 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 06:28:57 +0800 Subject: [PATCH 08/25] typing added in function arguments --- data/project_from_psrc.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index d8d5478..49ee2aa 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -5,21 +5,23 @@ import datasets from tqdm import tqdm import concurrent.futures +from typing import Type, Union, List, Optional from tqdm.contrib.concurrent import process_map -from promptsource.templates import DatasetTemplates +from promptsource.templates import DatasetTemplates, Template +from datasets import Dataset, DatasetDict, IterableDatasetDict, IterableDataset logger = logging.getLogger(__name__) def export_dataset( - dataset_output_dir, - dataset_name, - dataset_config, - psrc_prompt_template_signature, - prompt_template, - dataset, - add_source_metadata=False, - highlight_variables=False, -): + dataset_output_dir: str, + dataset_name: str, + dataset_config: str, + psrc_prompt_template_signature: str, + prompt_template: Type[Template], + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + add_source_metadata: bool = False, + highlight_variables: bool = False, +) -> str: splits = list(dataset.keys()) prompt_name = prompt_template.get_name() for split in splits: @@ -79,7 +81,7 @@ def export_dataset( return "Completed:: {} !".format(json_data_path) -def invoke_none(lst): +def invoke_none(lst: List[str]) -> Union[List[str], None]: for idx, val in enumerate(lst): if val == "None" or val == "none" or val == "null" or val == "": lst[idx] = None From e4fc101636aba9e661bbe6b20cecf0f3334c88e0 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 06:57:06 +0800 Subject: [PATCH 09/25] documentation & comment added. --- data/project_from_psrc.py | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 49ee2aa..8793df6 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -22,6 +22,21 @@ def export_dataset( add_source_metadata: bool = False, highlight_variables: bool = False, ) -> str: + """ + Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), + project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and + writes in the disk (arg: dataset_output_dir) + + Args: + dataset_output_dir (str): Path to the output directory where data will be saved. + dataset_name (str): Name of the hf-dataset. + dataset_config (str): Name of the hf-dataset config. + psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. + prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. + dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. + add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. + add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + """ splits = list(dataset.keys()) prompt_name = prompt_template.get_name() for split in splits: @@ -42,27 +57,29 @@ def export_dataset( dataset_name, dataset_config, split, psrc_prompt_template_signature, prompt_name ), ): - projected_sample = prompt_template.apply(sample, highlight_variables=False) - answer_choice_list = prompt_template.get_answer_choices_list(sample) - if len(projected_sample) != 2: + # Project/transform sample into instruction. + prompted_sample = prompt_template.apply(sample, highlight_variables=False) + answer_choice_list = prompt_template.get_answer_choices_list(sample) # set of potential outcomes. + if len(prompted_sample) != 2: # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample continue - source, target = projected_sample + source, target = prompted_sample projected_sample_with_metadata = { - "id": _id, - "source": source, - "target": target, - "psrc_prompt_template_signature": psrc_prompt_template_signature, - "prompt_name": prompt_name, - "prompt_answer_choice_list": answer_choice_list, - "dataset_name": dataset_name, - "dataset_config": dataset_config, - "split": split, - "metrics": prompt_template.metadata.metrics, - "original_task": prompt_template.metadata.original_task, - "choices_in_prompt": prompt_template.metadata.choices_in_prompt, - "languages": prompt_template.metadata.languages, + "id": _id, #An unique id for the sample. Each line of the `jsonl` file contains `json` data which has a unique id within the `jsonl` file. (datatype: string/int) + "source": source, # projected input for the language model. This is the instruction. (datatype: string) + "target": target, # projected output for the language model. This is the gold response. (datatype: string) + "psrc_prompt_template_signature": psrc_prompt_template_signature, # prompt template signature from promptsource repository. Usually, a set of prompt templates are written for a task (i.e., glue/cola, glue/mrpc). This usually refers to that task. (datatype: string) + "prompt_name": prompt_name, # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there could be many prompt templates. `prompt_name` refers to each of those prompt templates. (datatype: string) + "prompt_answer_choice_list": answer_choice_list, # Name of all potential outcomes. We often do not have any data for this field. Especially for generative tasks. Only categorical task has this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings) + "dataset_name": dataset_name, # Name of the huggingface dataset (datatype: string) + "dataset_config": dataset_config, # Subset name of the huggingface dataset (datatype: string) + "split": split, # Split name (i.e., train, dev, test) (datatype: string) + "metrics": prompt_template.metadata.metrics, # metrics to evaluate the response. (datatype: list of strings) + "original_task": prompt_template.metadata.original_task, # If the prompted sample (source, target) refers to the original task for the dataset being created (datatype: True/False) + "choices_in_prompt": prompt_template.metadata.choices_in_prompt, # If there is any randomness in the prompt generation (datatype: list of strings) + "languages": prompt_template.metadata.languages, # The language of the prompt template (not the dataset). (datatype: list of strings) } if highlight_variables: + # Add highlight between prompt tokens and dataset tokens. new_projected_sample = prompt_template.apply( sample, highlight_variables=highlight_variables ) @@ -71,6 +88,8 @@ def export_dataset( projected_sample_with_metadata["highlighted_target"] = target if add_source_metadata: + # Take a backup of the data columns of the original dataset. + # This will help us to recover original projection in case we loose track of the generated ones due to various modifications & filters. for k, v in sample.items(): k = "src_meta_{}".format(k) assert k not in projected_sample_with_metadata @@ -82,6 +101,10 @@ def export_dataset( def invoke_none(lst: List[str]) -> Union[List[str], None]: + """ + helper function. + Takes a list of string and replace `None` where needed. + """ for idx, val in enumerate(lst): if val == "None" or val == "none" or val == "null" or val == "": lst[idx] = None @@ -179,6 +202,7 @@ def main(): prompt_names = list(prompt_templates.name_to_id_mapping.keys()) for prompt_name in prompt_names: prompt_template = prompt_templates[prompt_name] + # pre-calculate the arguments for multiprocesssing. prompted_sample_gen_io_tuple = (dataset_output_dir, dataset_name_or_path, dataset_config, From 10b1bedaca7ba5036c493bf4654f42945963ca33 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 06:59:07 +0800 Subject: [PATCH 10/25] formatting --- data/project_from_psrc.py | 457 +++++++++++++++++++++----------------- 1 file changed, 249 insertions(+), 208 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 8793df6..54417b6 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -12,233 +12,274 @@ logger = logging.getLogger(__name__) + def export_dataset( - dataset_output_dir: str, - dataset_name: str, - dataset_config: str, - psrc_prompt_template_signature: str, - prompt_template: Type[Template], - dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], - add_source_metadata: bool = False, - highlight_variables: bool = False, + dataset_output_dir: str, + dataset_name: str, + dataset_config: str, + psrc_prompt_template_signature: str, + prompt_template: Type[Template], + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + add_source_metadata: bool = False, + highlight_variables: bool = False, ) -> str: - """ - Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), - project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and - writes in the disk (arg: dataset_output_dir) - - Args: - dataset_output_dir (str): Path to the output directory where data will be saved. - dataset_name (str): Name of the hf-dataset. - dataset_config (str): Name of the hf-dataset config. - psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. - prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. - dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. - add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. - add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. - """ - splits = list(dataset.keys()) - prompt_name = prompt_template.get_name() - for split in splits: - dataset_split = dataset[split] - json_data_path = os.path.join(dataset_output_dir, split) - os.makedirs(json_data_path, exist_ok=True) - json_data_path = os.path.join( - json_data_path, - (psrc_prompt_template_signature + "." + prompt_name).replace("/", "_").replace(" ", "_") - + ".jsonl", - ) - with open(json_data_path, "w", encoding="utf-8") as file_ptr: - total_num_sample = len(dataset_split) - for _id, sample in tqdm( - enumerate(dataset_split), - total=total_num_sample, - desc="{}_{}_{}_{}_{}".format( - dataset_name, dataset_config, split, psrc_prompt_template_signature, prompt_name - ), - ): - # Project/transform sample into instruction. - prompted_sample = prompt_template.apply(sample, highlight_variables=False) - answer_choice_list = prompt_template.get_answer_choices_list(sample) # set of potential outcomes. - if len(prompted_sample) != 2: # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample - continue - source, target = prompted_sample - projected_sample_with_metadata = { - "id": _id, #An unique id for the sample. Each line of the `jsonl` file contains `json` data which has a unique id within the `jsonl` file. (datatype: string/int) - "source": source, # projected input for the language model. This is the instruction. (datatype: string) - "target": target, # projected output for the language model. This is the gold response. (datatype: string) - "psrc_prompt_template_signature": psrc_prompt_template_signature, # prompt template signature from promptsource repository. Usually, a set of prompt templates are written for a task (i.e., glue/cola, glue/mrpc). This usually refers to that task. (datatype: string) - "prompt_name": prompt_name, # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there could be many prompt templates. `prompt_name` refers to each of those prompt templates. (datatype: string) - "prompt_answer_choice_list": answer_choice_list, # Name of all potential outcomes. We often do not have any data for this field. Especially for generative tasks. Only categorical task has this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings) - "dataset_name": dataset_name, # Name of the huggingface dataset (datatype: string) - "dataset_config": dataset_config, # Subset name of the huggingface dataset (datatype: string) - "split": split, # Split name (i.e., train, dev, test) (datatype: string) - "metrics": prompt_template.metadata.metrics, # metrics to evaluate the response. (datatype: list of strings) - "original_task": prompt_template.metadata.original_task, # If the prompted sample (source, target) refers to the original task for the dataset being created (datatype: True/False) - "choices_in_prompt": prompt_template.metadata.choices_in_prompt, # If there is any randomness in the prompt generation (datatype: list of strings) - "languages": prompt_template.metadata.languages, # The language of the prompt template (not the dataset). (datatype: list of strings) - } - if highlight_variables: - # Add highlight between prompt tokens and dataset tokens. - new_projected_sample = prompt_template.apply( - sample, highlight_variables=highlight_variables - ) - source, target = new_projected_sample - projected_sample_with_metadata["highlighted_source"] = source - projected_sample_with_metadata["highlighted_target"] = target - - if add_source_metadata: - # Take a backup of the data columns of the original dataset. - # This will help us to recover original projection in case we loose track of the generated ones due to various modifications & filters. - for k, v in sample.items(): - k = "src_meta_{}".format(k) - assert k not in projected_sample_with_metadata - projected_sample_with_metadata[k] = v - - file_ptr.write(json.dumps(projected_sample_with_metadata)) - file_ptr.write("\n") - return "Completed:: {} !".format(json_data_path) + """ + Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), + project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and + writes in the disk (arg: dataset_output_dir) + + Args: + dataset_output_dir (str): Path to the output directory where data will be saved. + dataset_name (str): Name of the hf-dataset. + dataset_config (str): Name of the hf-dataset config. + psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. + prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. + dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. + add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. + add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + """ + splits = list(dataset.keys()) + prompt_name = prompt_template.get_name() + for split in splits: + dataset_split = dataset[split] + json_data_path = os.path.join(dataset_output_dir, split) + os.makedirs(json_data_path, exist_ok=True) + json_data_path = os.path.join( + json_data_path, + (psrc_prompt_template_signature + "." + prompt_name) + .replace("/", "_") + .replace(" ", "_") + + ".jsonl", + ) + with open(json_data_path, "w", encoding="utf-8") as file_ptr: + total_num_sample = len(dataset_split) + for _id, sample in tqdm( + enumerate(dataset_split), + total=total_num_sample, + desc="{}_{}_{}_{}_{}".format( + dataset_name, + dataset_config, + split, + psrc_prompt_template_signature, + prompt_name, + ), + ): + # Project/transform sample into instruction. + prompted_sample = prompt_template.apply( + sample, highlight_variables=False + ) + answer_choice_list = prompt_template.get_answer_choices_list( + sample + ) # set of potential outcomes. + if ( + len(prompted_sample) != 2 + ): # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample + continue + source, target = prompted_sample + projected_sample_with_metadata = { + "id": _id, # An unique id for the sample. Each line of the `jsonl` file contains `json` data which has a unique id within the `jsonl` file. (datatype: string/int) + "source": source, # projected input for the language model. This is the instruction. (datatype: string) + "target": target, # projected output for the language model. This is the gold response. (datatype: string) + "psrc_prompt_template_signature": psrc_prompt_template_signature, # prompt template signature from promptsource repository. Usually, a set of prompt templates are written for a task (i.e., glue/cola, glue/mrpc). This usually refers to that task. (datatype: string) + "prompt_name": prompt_name, # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there could be many prompt templates. `prompt_name` refers to each of those prompt templates. (datatype: string) + "prompt_answer_choice_list": answer_choice_list, # Name of all potential outcomes. We often do not have any data for this field. Especially for generative tasks. Only categorical task has this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings) + "dataset_name": dataset_name, # Name of the huggingface dataset (datatype: string) + "dataset_config": dataset_config, # Subset name of the huggingface dataset (datatype: string) + "split": split, # Split name (i.e., train, dev, test) (datatype: string) + "metrics": prompt_template.metadata.metrics, # metrics to evaluate the response. (datatype: list of strings) + "original_task": prompt_template.metadata.original_task, # If the prompted sample (source, target) refers to the original task for the dataset being created (datatype: True/False) + "choices_in_prompt": prompt_template.metadata.choices_in_prompt, # If there is any randomness in the prompt generation (datatype: list of strings) + "languages": prompt_template.metadata.languages, # The language of the prompt template (not the dataset). (datatype: list of strings) + } + if highlight_variables: + # Add highlight between prompt tokens and dataset tokens. + new_projected_sample = prompt_template.apply( + sample, highlight_variables=highlight_variables + ) + source, target = new_projected_sample + projected_sample_with_metadata["highlighted_source"] = source + projected_sample_with_metadata["highlighted_target"] = target + + if add_source_metadata: + # Take a backup of the data columns of the original dataset. + # This will help us to recover original projection in case we loose track of the generated ones due to various modifications & filters. + for k, v in sample.items(): + k = "src_meta_{}".format(k) + assert k not in projected_sample_with_metadata + projected_sample_with_metadata[k] = v + + file_ptr.write(json.dumps(projected_sample_with_metadata)) + file_ptr.write("\n") + return "Completed:: {} !".format(json_data_path) def invoke_none(lst: List[str]) -> Union[List[str], None]: - """ - helper function. - Takes a list of string and replace `None` where needed. - """ - for idx, val in enumerate(lst): - if val == "None" or val == "none" or val == "null" or val == "": - lst[idx] = None - return lst + """ + helper function. + Takes a list of string and replace `None` where needed. + """ + for idx, val in enumerate(lst): + if val == "None" or val == "none" or val == "null" or val == "": + lst[idx] = None + return lst def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset-name-or-paths", - nargs="+", - default="glue", - help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-name-or-paths", + nargs="+", + default="glue", + help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). A supported list can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates . Usually prompt templates are written for a specific datasets. But in the case of a new dataset, it is possible to apply a different (written for a different dataset) prompt template to a new dataset as long as - the JSON structure of the dataset is the same as what is required in the original prompt template.""" - ) - parser.add_argument( - "--dataset-configs", - nargs="+", - default=None, - help="""A list of huggingface dataset-config. `--dataset-name-or-paths` along with `--dataset-configs` defines a data file. + the JSON structure of the dataset is the same as what is required in the original prompt template.""", + ) + parser.add_argument( + "--dataset-configs", + nargs="+", + default=None, + help="""A list of huggingface dataset-config. `--dataset-name-or-paths` along with `--dataset-configs` defines a data file. If there is no `--dataset-configs` in huggingface, use None. The first argument in the `--dataset-name-or-paths` refers to the - first argument of the `--dataset-configs`. There should be an equal number of argument in `--dataset-name-or-paths` and `--dataset-configs`.""" - ) - parser.add_argument( - "--prompt-templates-configs", - nargs="+", - default=None, - help="""Name of the prompt template. Please use `None` if you want to project with all the prompt templates. + first argument of the `--dataset-configs`. There should be an equal number of argument in `--dataset-name-or-paths` and `--dataset-configs`.""", + ) + parser.add_argument( + "--prompt-templates-configs", + nargs="+", + default=None, + help="""Name of the prompt template. Please use `None` if you want to project with all the prompt templates. The first argument in the `--dataset-name-or-paths` & `--dataset-configs` refers to the first argument of the `--prompt-templates-configs`. There should be an equal number of argument in - `--dataset-name-or-paths`, `--dataset-configs` and `--prompt-templates-configs`""" - ) - parser.add_argument( - "--cache-dir", - type=str, - required=True, - help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)", - ) - parser.add_argument( - "--output-dir", type=str, required=True, - help="Path to the output dir where the projected data will be stored." - ) - parser.add_argument( - "--num-proc", - type=int, - default=9, - help="Total number of parallel process." - ) - parser.add_argument( - "--add-source-metadata", - action="store_true", - help=""" + `--dataset-name-or-paths`, `--dataset-configs` and `--prompt-templates-configs`""", + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Path to the output dir where the projected data will be stored.", + ) + parser.add_argument( + "--num-proc", type=int, default=9, help="Total number of parallel process." + ) + parser.add_argument( + "--add-source-metadata", + action="store_true", + help=""" Add all the metadata from source dataset. This will create new keys names `src_meta_{original_keys}` where this `original_keys` are all the keys from the original dataset key names (a.k.a column name). These variable are kept with the completion so that we can recover the projection again if needed. """, - ) - parser.add_argument( - "--highlight-variables", - action="store_true", - help="""Highlight token that are coming from the prompts and original dataset." + ) + parser.add_argument( + "--highlight-variables", + action="store_true", + help="""Highlight token that are coming from the prompts and original dataset." This feature can be used to differentiate prompt tokens and input tokens.""", - ) - args = parser.parse_args() - - assert len(args.dataset_name_or_paths) == len(args.dataset_configs) - assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs) - - invoke_none(args.dataset_name_or_paths) - invoke_none(args.dataset_configs) - invoke_none(args.prompt_templates_configs) - - prompted_sample_gen_io_tuple_list = [] - # loading and caching each of the dataset & creating multiprocessor i/o for doing projection. - for (dataset_name_or_path, dataset_config, prompt_template_config) in zip( - args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs - ): - dataset = datasets.load_dataset(dataset_name_or_path, dataset_config, cache_dir=args.cache_dir) - psrc_prompt_template_signature = prompt_template_config - if psrc_prompt_template_signature is None: - if dataset_config is None: - psrc_prompt_template_signature = "{}".format(dataset_name_or_path) - else: - psrc_prompt_template_signature = "{}/{}".format(dataset_name_or_path, dataset_config) - dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) - os.makedirs(dataset_output_dir, exist_ok=True) - if dataset_config is not None: - dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) - os.makedirs(dataset_output_dir, exist_ok=True) - prompt_templates = DatasetTemplates(psrc_prompt_template_signature) - prompt_names = list(prompt_templates.name_to_id_mapping.keys()) - for prompt_name in prompt_names: - prompt_template = prompt_templates[prompt_name] - # pre-calculate the arguments for multiprocesssing. - prompted_sample_gen_io_tuple = (dataset_output_dir, - dataset_name_or_path, - dataset_config, - psrc_prompt_template_signature, - prompt_template, - dataset, - args.add_source_metadata, - args.highlight_variables) - prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) - - # Projecting data using multiprocessing. - # It's recommended to use large number of CPU machine if you are projecting multiple dataset. - # set up `--num-proc` accrodingly. - num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) - - with concurrent.futures.ProcessPoolExecutor( - max_workers=num_proc - ) as executor: - for _out in tqdm( - executor.map( - export_dataset, - [prompted_sample_gen_io[0] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_output_dir - [prompted_sample_gen_io[1] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_name_or_path - [prompted_sample_gen_io[2] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset_config - [prompted_sample_gen_io[3] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # psrc_prompt_template_signature - [prompted_sample_gen_io[4] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # prompt_template - [prompted_sample_gen_io[5] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # dataset - [prompted_sample_gen_io[6] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.add_source_metadata - [prompted_sample_gen_io[7] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list], # args.highlight_variables - ), - total=len(args.dataset_name_or_paths), - ): - try: - logger.info(_out) - except Exception as emsg: - logger.warning("Exception msg: {}".format(emsg)) + ) + args = parser.parse_args() + + assert len(args.dataset_name_or_paths) == len(args.dataset_configs) + assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs) + + invoke_none(args.dataset_name_or_paths) + invoke_none(args.dataset_configs) + invoke_none(args.prompt_templates_configs) + + prompted_sample_gen_io_tuple_list = [] + # loading and caching each of the dataset & creating multiprocessor i/o for doing projection. + for (dataset_name_or_path, dataset_config, prompt_template_config) in zip( + args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs + ): + dataset = datasets.load_dataset( + dataset_name_or_path, dataset_config, cache_dir=args.cache_dir + ) + psrc_prompt_template_signature = prompt_template_config + if psrc_prompt_template_signature is None: + if dataset_config is None: + psrc_prompt_template_signature = "{}".format(dataset_name_or_path) + else: + psrc_prompt_template_signature = "{}/{}".format( + dataset_name_or_path, dataset_config + ) + dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) + os.makedirs(dataset_output_dir, exist_ok=True) + if dataset_config is not None: + dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) + os.makedirs(dataset_output_dir, exist_ok=True) + prompt_templates = DatasetTemplates(psrc_prompt_template_signature) + prompt_names = list(prompt_templates.name_to_id_mapping.keys()) + for prompt_name in prompt_names: + prompt_template = prompt_templates[prompt_name] + # pre-calculate the arguments for multiprocesssing. + prompted_sample_gen_io_tuple = ( + dataset_output_dir, + dataset_name_or_path, + dataset_config, + psrc_prompt_template_signature, + prompt_template, + dataset, + args.add_source_metadata, + args.highlight_variables, + ) + prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) + + # Projecting data using multiprocessing. + # It's recommended to use large number of CPU machine if you are projecting multiple dataset. + # set up `--num-proc` accrodingly. + num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) + + with concurrent.futures.ProcessPoolExecutor(max_workers=num_proc) as executor: + for _out in tqdm( + executor.map( + export_dataset, + [ + prompted_sample_gen_io[0] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_output_dir + [ + prompted_sample_gen_io[1] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_name_or_path + [ + prompted_sample_gen_io[2] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_config + [ + prompted_sample_gen_io[3] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # psrc_prompt_template_signature + [ + prompted_sample_gen_io[4] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # prompt_template + [ + prompted_sample_gen_io[5] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset + [ + prompted_sample_gen_io[6] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # args.add_source_metadata + [ + prompted_sample_gen_io[7] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # args.highlight_variables + ), + total=len(args.dataset_name_or_paths), + ): + try: + logger.info(_out) + except Exception as emsg: + logger.warning("Exception msg: {}".format(emsg)) + if __name__ == "__main__": - main() \ No newline at end of file + main() From dcd4df0bcfbb35712d1d64bbb82e7c171f330021 Mon Sep 17 00:00:00 2001 From: M Saiful Bari <32699797+sbmaruf@users.noreply.github.com> Date: Fri, 10 Mar 2023 05:01:37 +0600 Subject: [PATCH 11/25] Update README.md Co-authored-by: Amr Kayid --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d538d10..83b3675 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ pip install -r requirements.txt ## Dataset Projection -### promptsource +### [PromptSource](https://github.com/bigscience-workshop/promptsource) ```shell DUMP_FOLDER='' # fill this with your desired address From b89d224bed616f3be3f4b07128001dfeb10184fb Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 07:06:23 +0800 Subject: [PATCH 12/25] typo \t cleaned --- data/project_from_psrc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 54417b6..40e6134 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -36,7 +36,7 @@ def export_dataset( prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. - add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. """ splits = list(dataset.keys()) prompt_name = prompt_template.get_name() From 52e8c44c0e165b14823b6420242cb26881b2181b Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Fri, 10 Mar 2023 07:07:27 +0800 Subject: [PATCH 13/25] formatting --- data/project_from_psrc.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 40e6134..4974148 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -29,14 +29,14 @@ def export_dataset( writes in the disk (arg: dataset_output_dir) Args: - dataset_output_dir (str): Path to the output directory where data will be saved. - dataset_name (str): Name of the hf-dataset. - dataset_config (str): Name of the hf-dataset config. - psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. - prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. - dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. - add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. - add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + dataset_output_dir (str): Path to the output directory where data will be saved. + dataset_name (str): Name of the hf-dataset. + dataset_config (str): Name of the hf-dataset config. + psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. + prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. + dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. + add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. + add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. """ splits = list(dataset.keys()) prompt_name = prompt_template.get_name() From 50491b9ed1373b08d450d9591269786850df23f9 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 19 Apr 2023 04:10:24 +0800 Subject: [PATCH 14/25] add xp3 format output option. --- data/project_from_psrc.py | 548 ++++++++++++++++++++++---------------- 1 file changed, 319 insertions(+), 229 deletions(-) diff --git a/data/project_from_psrc.py b/data/project_from_psrc.py index 4974148..cc5eb9c 100644 --- a/data/project_from_psrc.py +++ b/data/project_from_psrc.py @@ -14,21 +14,22 @@ def export_dataset( - dataset_output_dir: str, - dataset_name: str, - dataset_config: str, - psrc_prompt_template_signature: str, - prompt_template: Type[Template], - dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], - add_source_metadata: bool = False, - highlight_variables: bool = False, + dataset_output_dir: str, + dataset_name: str, + dataset_config: str, + psrc_prompt_template_signature: str, + prompt_template: Type[Template], + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + add_source_metadata: bool = False, + highlight_variables: bool = False, + lang: str = 'en' ) -> str: - """ - Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), - project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and - writes in the disk (arg: dataset_output_dir) + """ + Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), + project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and + writes in the disk (arg: dataset_output_dir) - Args: + Args: dataset_output_dir (str): Path to the output directory where data will be saved. dataset_name (str): Name of the hf-dataset. dataset_config (str): Name of the hf-dataset config. @@ -37,249 +38,338 @@ def export_dataset( dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. - """ - splits = list(dataset.keys()) - prompt_name = prompt_template.get_name() - for split in splits: - dataset_split = dataset[split] - json_data_path = os.path.join(dataset_output_dir, split) - os.makedirs(json_data_path, exist_ok=True) - json_data_path = os.path.join( - json_data_path, - (psrc_prompt_template_signature + "." + prompt_name) - .replace("/", "_") - .replace(" ", "_") - + ".jsonl", - ) - with open(json_data_path, "w", encoding="utf-8") as file_ptr: - total_num_sample = len(dataset_split) - for _id, sample in tqdm( - enumerate(dataset_split), - total=total_num_sample, - desc="{}_{}_{}_{}_{}".format( - dataset_name, - dataset_config, - split, - psrc_prompt_template_signature, - prompt_name, - ), - ): - # Project/transform sample into instruction. - prompted_sample = prompt_template.apply( - sample, highlight_variables=False - ) - answer_choice_list = prompt_template.get_answer_choices_list( - sample - ) # set of potential outcomes. - if ( - len(prompted_sample) != 2 - ): # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample - continue - source, target = prompted_sample - projected_sample_with_metadata = { - "id": _id, # An unique id for the sample. Each line of the `jsonl` file contains `json` data which has a unique id within the `jsonl` file. (datatype: string/int) - "source": source, # projected input for the language model. This is the instruction. (datatype: string) - "target": target, # projected output for the language model. This is the gold response. (datatype: string) - "psrc_prompt_template_signature": psrc_prompt_template_signature, # prompt template signature from promptsource repository. Usually, a set of prompt templates are written for a task (i.e., glue/cola, glue/mrpc). This usually refers to that task. (datatype: string) - "prompt_name": prompt_name, # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there could be many prompt templates. `prompt_name` refers to each of those prompt templates. (datatype: string) - "prompt_answer_choice_list": answer_choice_list, # Name of all potential outcomes. We often do not have any data for this field. Especially for generative tasks. Only categorical task has this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings) - "dataset_name": dataset_name, # Name of the huggingface dataset (datatype: string) - "dataset_config": dataset_config, # Subset name of the huggingface dataset (datatype: string) - "split": split, # Split name (i.e., train, dev, test) (datatype: string) - "metrics": prompt_template.metadata.metrics, # metrics to evaluate the response. (datatype: list of strings) - "original_task": prompt_template.metadata.original_task, # If the prompted sample (source, target) refers to the original task for the dataset being created (datatype: True/False) - "choices_in_prompt": prompt_template.metadata.choices_in_prompt, # If there is any randomness in the prompt generation (datatype: list of strings) - "languages": prompt_template.metadata.languages, # The language of the prompt template (not the dataset). (datatype: list of strings) - } - if highlight_variables: - # Add highlight between prompt tokens and dataset tokens. - new_projected_sample = prompt_template.apply( - sample, highlight_variables=highlight_variables - ) - source, target = new_projected_sample - projected_sample_with_metadata["highlighted_source"] = source - projected_sample_with_metadata["highlighted_target"] = target + lang (str = 'en'): language name of the dataset + """ + splits = list(dataset.keys()) + prompt_name = prompt_template.get_name() + for split in splits: + dataset_split = dataset[split] + json_data_path = os.path.join(dataset_output_dir, split) + os.makedirs(json_data_path, exist_ok=True) + json_data_path = os.path.join( + json_data_path, + (psrc_prompt_template_signature + "." + prompt_name) + .replace("/", "_") + .replace(" ", "_") + + f"_{lang}.jsonl", + ) + with open(json_data_path, "w", encoding="utf-8") as file_ptr: + total_num_sample = len(dataset_split) + for _id, sample in tqdm( + enumerate(dataset_split), + total=total_num_sample, + desc="{}_{}_{}_{}_{}".format( + dataset_name, + dataset_config, + split, + psrc_prompt_template_signature, + prompt_name, + ), + ): + # Project/transform sample into instruction. + prompted_sample = prompt_template.apply( + sample, highlight_variables=False + ) + answer_choice_list = prompt_template.get_answer_choices_list( + sample + ) # set of potential outcomes. + if ( + len(prompted_sample) != 2 + ): # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample + continue + source, target = prompted_sample + projected_sample_with_metadata = { + "id": _id, # An unique id for the sample. Each line of the `jsonl` file contains `json` data which has a unique id within the `jsonl` file. (datatype: string/int) + "source": source, # projected input for the language model. This is the instruction. (datatype: string) + "target": target, # projected output for the language model. This is the gold response. (datatype: string) + "psrc_prompt_template_signature": psrc_prompt_template_signature, # prompt template signature from promptsource repository. Usually, a set of prompt templates are written for a task (i.e., glue/cola, glue/mrpc). This usually refers to that task. (datatype: string) + "prompt_name": prompt_name, # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there could be many prompt templates. `prompt_name` refers to each of those prompt templates. (datatype: string) + "prompt_answer_choice_list": answer_choice_list, # Name of all potential outcomes. We often do not have any data for this field. Especially for generative tasks. Only categorical task has this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings) + "dataset_name": dataset_name, # Name of the huggingface dataset (datatype: string) + "dataset_config": dataset_config, # Subset name of the huggingface dataset (datatype: string) + "split": split, # Split name (i.e., train, dev, test) (datatype: string) + "metrics": prompt_template.metadata.metrics, # metrics to evaluate the response. (datatype: list of strings) + "original_task": prompt_template.metadata.original_task, # If the prompted sample (source, target) refers to the original task for the dataset being created (datatype: True/False) + "choices_in_prompt": prompt_template.metadata.choices_in_prompt, # If there is any randomness in the prompt generation (datatype: list of strings) + "languages": prompt_template.metadata.languages, # The language of the prompt template (not the dataset). (datatype: list of strings) + } + if highlight_variables: + # Add highlight between prompt tokens and dataset tokens. + new_projected_sample = prompt_template.apply( + sample, highlight_variables=highlight_variables + ) + source, target = new_projected_sample + projected_sample_with_metadata["highlighted_source"] = source + projected_sample_with_metadata["highlighted_target"] = target - if add_source_metadata: - # Take a backup of the data columns of the original dataset. - # This will help us to recover original projection in case we loose track of the generated ones due to various modifications & filters. - for k, v in sample.items(): - k = "src_meta_{}".format(k) - assert k not in projected_sample_with_metadata - projected_sample_with_metadata[k] = v + if add_source_metadata: + # Take a backup of the data columns of the original dataset. + # This will help us to recover original projection in case we loose track of the generated ones due to various modifications & filters. + for k, v in sample.items(): + k = "src_meta_{}".format(k) + assert k not in projected_sample_with_metadata + projected_sample_with_metadata[k] = v - file_ptr.write(json.dumps(projected_sample_with_metadata)) - file_ptr.write("\n") - return "Completed:: {} !".format(json_data_path) + file_ptr.write(json.dumps(projected_sample_with_metadata)) + file_ptr.write("\n") + return "Completed:: {} !".format(json_data_path) + + +def xp3_export_dataset( + dataset_output_dir: str, + dataset_name: str, + dataset_config: str, + psrc_prompt_template_signature: str, + prompt_template: Type[Template], + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + add_source_metadata: bool = False, + highlight_variables: bool = False, + lang: str = 'en' +) -> str: + """ + Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), + project/transform samples from all the splits of dataset (arg: dataset) into an instruction format and + writes in the disk (arg: dataset_output_dir) + + Args: + dataset_output_dir (str): Path to the output directory where data will be saved. + dataset_name (str): Name of the hf-dataset. + dataset_config (str): Name of the hf-dataset config. + psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. + prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. + dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. + add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. + add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + lang (str = 'en'): language name of the dataset + """ + splits = list(dataset.keys()) + prompt_name = prompt_template.get_name() + for split in splits: + dataset_split = dataset[split] + json_data_path = os.path.join(dataset_output_dir, split) + os.makedirs(json_data_path, exist_ok=True) + json_data_path = os.path.join( + json_data_path, + f"foraiml_{dataset_name}_{lang}_{prompt_name}.jsonl" + ) + with open(json_data_path, "w", encoding="utf-8") as file_ptr: + total_num_sample = len(dataset_split) + for _id, sample in tqdm( + enumerate(dataset_split), + total=total_num_sample, + desc="{}_{}_{}_{}_{}".format( + dataset_name, + dataset_config, + split, + psrc_prompt_template_signature, + prompt_name, + ), + ): + # Project/transform sample into instruction. + prompted_sample = prompt_template.apply( + sample, highlight_variables=False + ) + answer_choice_list = prompt_template.get_answer_choices_list( + sample + ) # set of potential outcomes. + if ( + len(prompted_sample) != 2 + ): # if the prompt doesn't generate a tuple, that means it's an invalid prompted_sample + continue + source, target = prompted_sample + projected_sample_with_metadata = { + "inputs": source, # projected input for the language model. This is the instruction. (datatype: string) + "targets": target, # projected output for the language model. This is the gold response. (datatype: string) + } + + file_ptr.write(json.dumps(projected_sample_with_metadata)) + file_ptr.write("\n") + return "Completed:: {} !".format(json_data_path) def invoke_none(lst: List[str]) -> Union[List[str], None]: - """ - helper function. - Takes a list of string and replace `None` where needed. - """ - for idx, val in enumerate(lst): - if val == "None" or val == "none" or val == "null" or val == "": - lst[idx] = None - return lst + """ + helper function. + Takes a list of string and replace `None` where needed. + """ + for idx, val in enumerate(lst): + if val == "None" or val == "none" or val == "null" or val == "": + lst[idx] = None + return lst def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset-name-or-paths", - nargs="+", - default="glue", - help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-name-or-paths", + nargs="+", + default="glue", + help="""A list of paths (seperated by space) to a huggingface dataset (or huggingface dataset singnature, i.e, super_glue, squad_v2). A supported list can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates . Usually prompt templates are written for a specific datasets. But in the case of a new dataset, it is possible to apply a different (written for a different dataset) prompt template to a new dataset as long as the JSON structure of the dataset is the same as what is required in the original prompt template.""", - ) - parser.add_argument( - "--dataset-configs", - nargs="+", - default=None, - help="""A list of huggingface dataset-config. `--dataset-name-or-paths` along with `--dataset-configs` defines a data file. + ) + parser.add_argument( + "--dataset-configs", + nargs="+", + default=None, + help="""A list of huggingface dataset-config. `--dataset-name-or-paths` along with `--dataset-configs` defines a data file. If there is no `--dataset-configs` in huggingface, use None. The first argument in the `--dataset-name-or-paths` refers to the first argument of the `--dataset-configs`. There should be an equal number of argument in `--dataset-name-or-paths` and `--dataset-configs`.""", - ) - parser.add_argument( - "--prompt-templates-configs", - nargs="+", - default=None, - help="""Name of the prompt template. Please use `None` if you want to project with all the prompt templates. + ) + parser.add_argument( + "--prompt-templates-configs", + nargs="+", + default=None, + help="""Name of the prompt template. Please use `None` if you want to project with all the prompt templates. The first argument in the `--dataset-name-or-paths` & `--dataset-configs` refers to the first argument of the `--prompt-templates-configs`. There should be an equal number of argument in `--dataset-name-or-paths`, `--dataset-configs` and `--prompt-templates-configs`""", - ) - parser.add_argument( - "--cache-dir", - type=str, - required=True, - help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)", - ) - parser.add_argument( - "--output-dir", - type=str, - required=True, - help="Path to the output dir where the projected data will be stored.", - ) - parser.add_argument( - "--num-proc", type=int, default=9, help="Total number of parallel process." - ) - parser.add_argument( - "--add-source-metadata", - action="store_true", - help=""" + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Path to the output dir where the projected data will be stored.", + ) + parser.add_argument( + "--num-proc", type=int, default=9, help="Total number of parallel process." + ) + parser.add_argument( + "--add-source-metadata", + action="store_true", + help=""" Add all the metadata from source dataset. This will create new keys names `src_meta_{original_keys}` where this `original_keys` are all the keys from the original dataset key names (a.k.a column name). These variable are kept with the completion so that we can recover the projection again if needed. """, - ) - parser.add_argument( - "--highlight-variables", - action="store_true", - help="""Highlight token that are coming from the prompts and original dataset." + ) + parser.add_argument( + "--highlight-variables", + action="store_true", + help="""Highlight token that are coming from the prompts and original dataset." This feature can be used to differentiate prompt tokens and input tokens.""", - ) - args = parser.parse_args() - - assert len(args.dataset_name_or_paths) == len(args.dataset_configs) - assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs) + ) + parser.add_argument( + "--xp3-format", + action="store_true", + help="""Export the data in xP3 format""", + ) + parser.add_argument( + "--lang", + type=str, + default='en', + help="""Language name. Required for xP3 naming of the file.""", + ) + args = parser.parse_args() - invoke_none(args.dataset_name_or_paths) - invoke_none(args.dataset_configs) - invoke_none(args.prompt_templates_configs) + assert len(args.dataset_name_or_paths) == len(args.dataset_configs) + assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs) + export_dataset_func = xp3_export_dataset if args.xp3_format else export_dataset + if args.xp3_format and args.highlight_variables: + print(f"Ignoring {args.highlight_variables=} since {args.xp3_format}") + if args.xp3_format and args.add_source_metadata: + print(f"Ignoring {args.add_source_metadata=} since {args.xp3_format}") + + invoke_none(args.dataset_name_or_paths) + invoke_none(args.dataset_configs) + invoke_none(args.prompt_templates_configs) - prompted_sample_gen_io_tuple_list = [] - # loading and caching each of the dataset & creating multiprocessor i/o for doing projection. - for (dataset_name_or_path, dataset_config, prompt_template_config) in zip( - args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs - ): - dataset = datasets.load_dataset( - dataset_name_or_path, dataset_config, cache_dir=args.cache_dir - ) - psrc_prompt_template_signature = prompt_template_config - if psrc_prompt_template_signature is None: - if dataset_config is None: - psrc_prompt_template_signature = "{}".format(dataset_name_or_path) - else: - psrc_prompt_template_signature = "{}/{}".format( - dataset_name_or_path, dataset_config - ) - dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) - os.makedirs(dataset_output_dir, exist_ok=True) - if dataset_config is not None: - dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) - os.makedirs(dataset_output_dir, exist_ok=True) - prompt_templates = DatasetTemplates(psrc_prompt_template_signature) - prompt_names = list(prompt_templates.name_to_id_mapping.keys()) - for prompt_name in prompt_names: - prompt_template = prompt_templates[prompt_name] - # pre-calculate the arguments for multiprocesssing. - prompted_sample_gen_io_tuple = ( - dataset_output_dir, - dataset_name_or_path, - dataset_config, - psrc_prompt_template_signature, - prompt_template, - dataset, - args.add_source_metadata, - args.highlight_variables, - ) - prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) + prompted_sample_gen_io_tuple_list = [] + # loading and caching each of the dataset & creating multiprocessor i/o for doing projection. + for (dataset_name_or_path, dataset_config, prompt_template_config) in zip( + args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs + ): + dataset = datasets.load_dataset( + dataset_name_or_path, dataset_config, cache_dir=args.cache_dir + ) + psrc_prompt_template_signature = prompt_template_config + if psrc_prompt_template_signature is None: + if dataset_config is None: + psrc_prompt_template_signature = "{}".format(dataset_name_or_path) + else: + psrc_prompt_template_signature = "{}/{}".format( + dataset_name_or_path, dataset_config + ) + dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) + os.makedirs(dataset_output_dir, exist_ok=True) + if dataset_config is not None: + dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) + os.makedirs(dataset_output_dir, exist_ok=True) + prompt_templates = DatasetTemplates(psrc_prompt_template_signature) + prompt_names = list(prompt_templates.name_to_id_mapping.keys()) + for prompt_name in prompt_names: + prompt_template = prompt_templates[prompt_name] + # pre-calculate the arguments for multiprocesssing. + prompted_sample_gen_io_tuple = ( + dataset_output_dir, + dataset_name_or_path, + dataset_config, + psrc_prompt_template_signature, + prompt_template, + dataset, + args.add_source_metadata, + args.highlight_variables, + ) + prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) - # Projecting data using multiprocessing. - # It's recommended to use large number of CPU machine if you are projecting multiple dataset. - # set up `--num-proc` accrodingly. - num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) + # Projecting data using multiprocessing. + # It's recommended to use large number of CPU machine if you are projecting multiple dataset. + # set up `--num-proc` accrodingly. + num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) - with concurrent.futures.ProcessPoolExecutor(max_workers=num_proc) as executor: - for _out in tqdm( - executor.map( - export_dataset, - [ - prompted_sample_gen_io[0] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_output_dir - [ - prompted_sample_gen_io[1] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_name_or_path - [ - prompted_sample_gen_io[2] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_config - [ - prompted_sample_gen_io[3] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # psrc_prompt_template_signature - [ - prompted_sample_gen_io[4] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # prompt_template - [ - prompted_sample_gen_io[5] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset - [ - prompted_sample_gen_io[6] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # args.add_source_metadata - [ - prompted_sample_gen_io[7] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # args.highlight_variables - ), - total=len(args.dataset_name_or_paths), - ): - try: - logger.info(_out) - except Exception as emsg: - logger.warning("Exception msg: {}".format(emsg)) + with concurrent.futures.ProcessPoolExecutor(max_workers=num_proc) as executor: + for _out in tqdm( + executor.map( + export_dataset_func, + [ + prompted_sample_gen_io[0] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_output_dir + [ + prompted_sample_gen_io[1] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_name_or_path + [ + prompted_sample_gen_io[2] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset_config + [ + prompted_sample_gen_io[3] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # psrc_prompt_template_signature + [ + prompted_sample_gen_io[4] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # prompt_template + [ + prompted_sample_gen_io[5] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # dataset + [ + prompted_sample_gen_io[6] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # args.add_source_metadata + [ + prompted_sample_gen_io[7] + for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list + ], # args.highlight_variables + ), + total=len(args.dataset_name_or_paths), + ): + try: + logger.info(_out) + except Exception as emsg: + logger.warning("Exception msg: {}".format(emsg)) if __name__ == "__main__": - main() + main() From e0846ce8e603ed6164380eb99e7748d5d04860b2 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 19 Apr 2023 04:10:49 +0800 Subject: [PATCH 15/25] xp3 option example --- scripts/project_from_psrc.sh | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/project_from_psrc.sh b/scripts/project_from_psrc.sh index 352e9ee..adbc8ad 100644 --- a/scripts/project_from_psrc.sh +++ b/scripts/project_from_psrc.sh @@ -1,14 +1,30 @@ +# The native format with a lots of metadata DUMP_FOLDER='./raw' SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc mkdir -p $SRC_DATA_FOLDER mkdir -p $SRC_DATA_FOLDER/cache python data/project_from_psrc.py \ ---dataset-name-or-paths glue glue glue glue glue \ ---dataset-configs cola sst2 mrpc qqp stsb \ ---prompt-templates-configs None None None None None \ +--dataset-name-or-paths nq_open \ +--dataset-configs None \ +--prompt-templates-configs None \ --cache-dir $SRC_DATA_FOLDER/cache \ --output-dir $SRC_DATA_FOLDER \ --highlight-variables \ --add-source-metadata \ +--num-proc 16 + +# The xP3 format +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache + +python data/project_from_psrc.py \ +--dataset-name-or-paths nq_open \ +--dataset-configs None \ +--prompt-templates-configs None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--xp3-format \ --num-proc 16 \ No newline at end of file From fc3ad729764386181fb1b1ae64633d72f632288f Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:45:12 +0800 Subject: [PATCH 16/25] update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b6e4761..ef9df96 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,4 @@ dmypy.json # Pyre type checker .pyre/ +.DS_Store \ No newline at end of file From 6c31e260556fe0b02e763693061b95db5cb3009a Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 20:45:37 +0800 Subject: [PATCH 17/25] rename file --- data/{project_from_psrc.py => project_from_promptsource.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data/{project_from_psrc.py => project_from_promptsource.py} (100%) diff --git a/data/project_from_psrc.py b/data/project_from_promptsource.py similarity index 100% rename from data/project_from_psrc.py rename to data/project_from_promptsource.py From 32e473be578af436eb5147a495a77545a58a923d Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 21:10:01 +0800 Subject: [PATCH 18/25] example script renamed --- scripts/project_from_promptsource.sh | 30 ++++++++++++++++++++++++++++ scripts/project_from_psrc.sh | 30 ---------------------------- 2 files changed, 30 insertions(+), 30 deletions(-) create mode 100644 scripts/project_from_promptsource.sh delete mode 100644 scripts/project_from_psrc.sh diff --git a/scripts/project_from_promptsource.sh b/scripts/project_from_promptsource.sh new file mode 100644 index 0000000..33bb2bb --- /dev/null +++ b/scripts/project_from_promptsource.sh @@ -0,0 +1,30 @@ +# The native format with a lots of metadata +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache + +python data/project_from_promptsource.py \ +--dataset-name-or-paths nq_open \ +--dataset-configs None \ +--prompt-templates-configs None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--highlight-variables \ +--add-source-metadata \ +--num-proc 16 + +# # The xP3 format +# DUMP_FOLDER='./raw' +# SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +# mkdir -p $SRC_DATA_FOLDER +# mkdir -p $SRC_DATA_FOLDER/cache + +# python data/project_from_promptsource.py \ +# --dataset-name-or-paths nq_open \ +# --dataset-configs None \ +# --prompt-templates-configs None \ +# --cache-dir $SRC_DATA_FOLDER/cache \ +# --output-dir $SRC_DATA_FOLDER \ +# --xp3-format \ +# --num-proc 16 \ No newline at end of file diff --git a/scripts/project_from_psrc.sh b/scripts/project_from_psrc.sh deleted file mode 100644 index adbc8ad..0000000 --- a/scripts/project_from_psrc.sh +++ /dev/null @@ -1,30 +0,0 @@ -# The native format with a lots of metadata -DUMP_FOLDER='./raw' -SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc -mkdir -p $SRC_DATA_FOLDER -mkdir -p $SRC_DATA_FOLDER/cache - -python data/project_from_psrc.py \ ---dataset-name-or-paths nq_open \ ---dataset-configs None \ ---prompt-templates-configs None \ ---cache-dir $SRC_DATA_FOLDER/cache \ ---output-dir $SRC_DATA_FOLDER \ ---highlight-variables \ ---add-source-metadata \ ---num-proc 16 - -# The xP3 format -DUMP_FOLDER='./raw' -SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc -mkdir -p $SRC_DATA_FOLDER -mkdir -p $SRC_DATA_FOLDER/cache - -python data/project_from_psrc.py \ ---dataset-name-or-paths nq_open \ ---dataset-configs None \ ---prompt-templates-configs None \ ---cache-dir $SRC_DATA_FOLDER/cache \ ---output-dir $SRC_DATA_FOLDER \ ---xp3-format \ ---num-proc 16 \ No newline at end of file From 706a7dcedd224e4f9bb92664886ca1f4596fcc37 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Sun, 30 Apr 2023 21:10:49 +0800 Subject: [PATCH 19/25] Fix argument passing in executor.map for export_dataset_func --- data/project_from_promptsource.py | 37 +------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py index cc5eb9c..29481f7 100644 --- a/data/project_from_promptsource.py +++ b/data/project_from_promptsource.py @@ -325,44 +325,9 @@ def main(): # It's recommended to use large number of CPU machine if you are projecting multiple dataset. # set up `--num-proc` accrodingly. num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list)) - with concurrent.futures.ProcessPoolExecutor(max_workers=num_proc) as executor: for _out in tqdm( - executor.map( - export_dataset_func, - [ - prompted_sample_gen_io[0] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_output_dir - [ - prompted_sample_gen_io[1] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_name_or_path - [ - prompted_sample_gen_io[2] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset_config - [ - prompted_sample_gen_io[3] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # psrc_prompt_template_signature - [ - prompted_sample_gen_io[4] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # prompt_template - [ - prompted_sample_gen_io[5] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # dataset - [ - prompted_sample_gen_io[6] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # args.add_source_metadata - [ - prompted_sample_gen_io[7] - for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list - ], # args.highlight_variables - ), + executor.map(export_dataset_func, *zip(*prompted_sample_gen_io_tuple_list)), total=len(args.dataset_name_or_paths), ): try: From 9fdceca95c9da36358f94c790f6bc6185b86c76e Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:23:51 +0800 Subject: [PATCH 20/25] update doc-string --- data/project_from_promptsource.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py index 29481f7..fe3c81c 100644 --- a/data/project_from_promptsource.py +++ b/data/project_from_promptsource.py @@ -37,7 +37,7 @@ def export_dataset( prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. - add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. + highlight_variables (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. lang (str = 'en'): language name of the dataset """ splits = list(dataset.keys()) @@ -122,8 +122,6 @@ def xp3_export_dataset( psrc_prompt_template_signature: str, prompt_template: Type[Template], dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], - add_source_metadata: bool = False, - highlight_variables: bool = False, lang: str = 'en' ) -> str: """ @@ -138,8 +136,6 @@ def xp3_export_dataset( psrc_prompt_template_signature (str): Name of the dataset & dataset-config for which prompts are written for. prompt_template (Type[Template]): Transformation/projection module that will take a sample from arg:dataset and transform it to an instruction. dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. - add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. - add_source_metadata (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. lang (str = 'en'): language name of the dataset """ splits = list(dataset.keys()) From afe31357c2d9ce2146a85a93227a9272fe8f28b8 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:27:09 +0800 Subject: [PATCH 21/25] bug-fix: func argument --- data/project_from_promptsource.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py index fe3c81c..9d030f1 100644 --- a/data/project_from_promptsource.py +++ b/data/project_from_promptsource.py @@ -305,16 +305,26 @@ def main(): for prompt_name in prompt_names: prompt_template = prompt_templates[prompt_name] # pre-calculate the arguments for multiprocesssing. - prompted_sample_gen_io_tuple = ( - dataset_output_dir, - dataset_name_or_path, - dataset_config, - psrc_prompt_template_signature, - prompt_template, - dataset, - args.add_source_metadata, - args.highlight_variables, - ) + if args.xp3_format: + prompted_sample_gen_io_tuple = ( + dataset_output_dir, + dataset_name_or_path, + dataset_config, + psrc_prompt_template_signature, + prompt_template, + dataset + ) + else: + prompted_sample_gen_io_tuple = ( + dataset_output_dir, + dataset_name_or_path, + dataset_config, + psrc_prompt_template_signature, + prompt_template, + dataset, + args.add_source_metadata, + args.highlight_variables, + ) prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple) # Projecting data using multiprocessing. From f82ad938ec32e128758c8e2e334133ea0ccd33c9 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:40:25 +0800 Subject: [PATCH 22/25] naming improvement --- data/project_from_promptsource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py index 9d030f1..568635d 100644 --- a/data/project_from_promptsource.py +++ b/data/project_from_promptsource.py @@ -144,9 +144,11 @@ def xp3_export_dataset( dataset_split = dataset[split] json_data_path = os.path.join(dataset_output_dir, split) os.makedirs(json_data_path, exist_ok=True) + __simp_dataset_name = dataset_name.replace("/", "_") + __simp_prompt_name = prompt_name.replace("/", "_") json_data_path = os.path.join( json_data_path, - f"foraiml_{dataset_name}_{lang}_{prompt_name}.jsonl" + f"foraiml_{__simp_dataset_name}_{lang}_{__simp_prompt_name}.jsonl" ) with open(json_data_path, "w", encoding="utf-8") as file_ptr: total_num_sample = len(dataset_split) @@ -295,7 +297,7 @@ def main(): psrc_prompt_template_signature = "{}/{}".format( dataset_name_or_path, dataset_config ) - dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path) + dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path.replace("/", "_")) os.makedirs(dataset_output_dir, exist_ok=True) if dataset_config is not None: dataset_output_dir = os.path.join(dataset_output_dir, dataset_config) From 148a14f847451cba651c75ae5654b76da39cfd4e Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:47:59 +0800 Subject: [PATCH 23/25] update naming & doc-string --- data/project_from_promptsource.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py index 568635d..98edcae 100644 --- a/data/project_from_promptsource.py +++ b/data/project_from_promptsource.py @@ -21,8 +21,7 @@ def export_dataset( prompt_template: Type[Template], dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], add_source_metadata: bool = False, - highlight_variables: bool = False, - lang: str = 'en' + highlight_variables: bool = False ) -> str: """ Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template), @@ -38,7 +37,6 @@ def export_dataset( dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset. add_source_metadata (bool = False): If True, all the data column from the args:dataset will be saved as a meta information with the instruction dataset. highlight_variables (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`. - lang (str = 'en'): language name of the dataset """ splits = list(dataset.keys()) prompt_name = prompt_template.get_name() @@ -51,7 +49,7 @@ def export_dataset( (psrc_prompt_template_signature + "." + prompt_name) .replace("/", "_") .replace(" ", "_") - + f"_{lang}.jsonl", + + ".jsonl", ) with open(json_data_path, "w", encoding="utf-8") as file_ptr: total_num_sample = len(dataset_split) From 18587867b81b469d2212653726e7c495e70bfe00 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:49:01 +0800 Subject: [PATCH 24/25] add custom data gen example --- scripts/project_from_promptsource.sh | 46 ++++++++++++++++++---------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/scripts/project_from_promptsource.sh b/scripts/project_from_promptsource.sh index 33bb2bb..f30d3bc 100644 --- a/scripts/project_from_promptsource.sh +++ b/scripts/project_from_promptsource.sh @@ -1,20 +1,20 @@ -# The native format with a lots of metadata -DUMP_FOLDER='./raw' -SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource -mkdir -p $SRC_DATA_FOLDER -mkdir -p $SRC_DATA_FOLDER/cache +# # The native format with a lots of metadata +# DUMP_FOLDER='./raw' +# SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +# mkdir -p $SRC_DATA_FOLDER +# mkdir -p $SRC_DATA_FOLDER/cache -python data/project_from_promptsource.py \ ---dataset-name-or-paths nq_open \ ---dataset-configs None \ ---prompt-templates-configs None \ ---cache-dir $SRC_DATA_FOLDER/cache \ ---output-dir $SRC_DATA_FOLDER \ ---highlight-variables \ ---add-source-metadata \ ---num-proc 16 +# python data/project_from_promptsource.py \ +# --dataset-name-or-paths nq_open \ +# --dataset-configs None \ +# --prompt-templates-configs None \ +# --cache-dir $SRC_DATA_FOLDER/cache \ +# --output-dir $SRC_DATA_FOLDER \ +# --highlight-variables \ +# --add-source-metadata \ +# --num-proc 16 -# # The xP3 format +# # # The xP3 format # DUMP_FOLDER='./raw' # SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource # mkdir -p $SRC_DATA_FOLDER @@ -27,4 +27,18 @@ python data/project_from_promptsource.py \ # --cache-dir $SRC_DATA_FOLDER/cache \ # --output-dir $SRC_DATA_FOLDER \ # --xp3-format \ -# --num-proc 16 \ No newline at end of file +# --num-proc 16 + +# Project custom data +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache + +python data/project_from_promptsource.py \ +--dataset-name-or-paths shmuhammad/AfriSenti-twitter-sentiment \ +--dataset-configs amh \ +--prompt-templates-configs None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--num-proc 16 \ No newline at end of file From 78bbf49fa47212592f0d73544ad9a6852e43f6f5 Mon Sep 17 00:00:00 2001 From: M Saiful Bari Date: Wed, 10 May 2023 07:49:37 +0800 Subject: [PATCH 25/25] uncomment --- scripts/project_from_promptsource.sh | 50 ++++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/scripts/project_from_promptsource.sh b/scripts/project_from_promptsource.sh index f30d3bc..36b644a 100644 --- a/scripts/project_from_promptsource.sh +++ b/scripts/project_from_promptsource.sh @@ -1,33 +1,33 @@ # # The native format with a lots of metadata -# DUMP_FOLDER='./raw' -# SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource -# mkdir -p $SRC_DATA_FOLDER -# mkdir -p $SRC_DATA_FOLDER/cache +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache -# python data/project_from_promptsource.py \ -# --dataset-name-or-paths nq_open \ -# --dataset-configs None \ -# --prompt-templates-configs None \ -# --cache-dir $SRC_DATA_FOLDER/cache \ -# --output-dir $SRC_DATA_FOLDER \ -# --highlight-variables \ -# --add-source-metadata \ -# --num-proc 16 +python data/project_from_promptsource.py \ +--dataset-name-or-paths nq_open \ +--dataset-configs None \ +--prompt-templates-configs None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--highlight-variables \ +--add-source-metadata \ +--num-proc 16 # # # The xP3 format -# DUMP_FOLDER='./raw' -# SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource -# mkdir -p $SRC_DATA_FOLDER -# mkdir -p $SRC_DATA_FOLDER/cache +DUMP_FOLDER='./raw' +SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource +mkdir -p $SRC_DATA_FOLDER +mkdir -p $SRC_DATA_FOLDER/cache -# python data/project_from_promptsource.py \ -# --dataset-name-or-paths nq_open \ -# --dataset-configs None \ -# --prompt-templates-configs None \ -# --cache-dir $SRC_DATA_FOLDER/cache \ -# --output-dir $SRC_DATA_FOLDER \ -# --xp3-format \ -# --num-proc 16 +python data/project_from_promptsource.py \ +--dataset-name-or-paths nq_open \ +--dataset-configs None \ +--prompt-templates-configs None \ +--cache-dir $SRC_DATA_FOLDER/cache \ +--output-dir $SRC_DATA_FOLDER \ +--xp3-format \ +--num-proc 16 # Project custom data DUMP_FOLDER='./raw'