diff --git a/.gitignore b/.gitignore
index b6e4761..ef9df96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.DS_Store
\ No newline at end of file
diff --git a/README.md b/README.md
index 91925ae..83b3675 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,33 @@ conda activate instructmultilingual
 pip install -r requirements.txt
 ```
 
+## Dataset Projection
+
+### [PromptSource](https://github.com/bigscience-workshop/promptsource)
+
+```shell
+DUMP_FOLDER='' # fill this with your desired path
+SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_promptsource
+mkdir -p $SRC_DATA_FOLDER
+mkdir -p $SRC_DATA_FOLDER/cache
+
+python data/project_from_promptsource.py \
+--dataset-name-or-paths glue glue glue glue glue \
+--dataset-configs cola sst2 mrpc qqp stsb \
+--prompt-templates-configs None None None None None \
+--cache-dir $SRC_DATA_FOLDER/cache \
+--output-dir $SRC_DATA_FOLDER \
+--highlight-variables \
+--add-source-metadata \
+--num-proc 16
+```
+
+See the details of the arguments with:
+
+```shell
+python data/project_from_promptsource.py --help
+```
+
 ## Translate
 
 ```shell
diff --git a/data/project_from_promptsource.py b/data/project_from_promptsource.py
new file mode 100644
index 0000000..98edcae
--- /dev/null
+++ b/data/project_from_promptsource.py
@@ -0,0 +1,346 @@
+import os
+import json
+import logging
+import argparse
+import datasets
+from tqdm import tqdm
+import concurrent.futures
+from typing import Type, Union, List, Optional
+from tqdm.contrib.concurrent import process_map
+from promptsource.templates import DatasetTemplates, Template
+from datasets import Dataset, DatasetDict, IterableDatasetDict, IterableDataset
+
+logger = logging.getLogger(__name__)
+
+
+def export_dataset(
+    dataset_output_dir: str,
+    dataset_name: str,
+    dataset_config: str,
+    psrc_prompt_template_signature: str,
+    prompt_template: Type[Template],
+    dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
+    add_source_metadata: bool = False,
+    highlight_variables: bool = False
+) -> str:
+    """
+    Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template),
+    projects/transforms samples from all splits of the dataset into an instruction format and
+    writes them to disk (arg: dataset_output_dir).
+
+    Args:
+        dataset_output_dir (str): Path to the output directory where data will be saved.
+        dataset_name (str): Name of the hf-dataset.
+        dataset_config (str): Name of the hf-dataset config.
+        psrc_prompt_template_signature (str): Name of the dataset & dataset-config that the prompts were written for.
+        prompt_template (Type[Template]): Transformation/projection module that takes a sample from arg:dataset and transforms it into an instruction.
+        dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset.
+        add_source_metadata (bool = False): If True, all the data columns from arg:dataset will be saved as meta information with the instruction dataset.
+        highlight_variables (bool = False): If True, prompt tokens and dataset tokens will be highlighted differently. This metadata will be saved as `highlighted_source` & `highlighted_target`.
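+
+    Returns:
+        str: A status message containing the path of the `.jsonl` file written for the last split.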
+    """
+    splits = list(dataset.keys())
+    prompt_name = prompt_template.get_name()
+    for split in splits:
+        dataset_split = dataset[split]
+        json_data_path = os.path.join(dataset_output_dir, split)
+        os.makedirs(json_data_path, exist_ok=True)
+        json_data_path = os.path.join(
+            json_data_path,
+            (psrc_prompt_template_signature + "." + prompt_name)
+            .replace("/", "_")
+            .replace(" ", "_")
+            + ".jsonl",
+        )
+        with open(json_data_path, "w", encoding="utf-8") as file_ptr:
+            total_num_sample = len(dataset_split)
+            for _id, sample in tqdm(
+                enumerate(dataset_split),
+                total=total_num_sample,
+                desc="{}_{}_{}_{}_{}".format(
+                    dataset_name,
+                    dataset_config,
+                    split,
+                    psrc_prompt_template_signature,
+                    prompt_name,
+                ),
+            ):
+                # Project/transform the sample into an instruction.
+                prompted_sample = prompt_template.apply(
+                    sample, highlight_variables=False
+                )
+                answer_choice_list = prompt_template.get_answer_choices_list(
+                    sample
+                )  # set of potential outcomes.
+                if (
+                    len(prompted_sample) != 2
+                ):  # if the prompt doesn't generate a (source, target) pair, the prompted_sample is invalid
+                    continue
+                source, target = prompted_sample
+                projected_sample_with_metadata = {
+                    "id": _id,  # A unique id for the sample. Each line of the `jsonl` file contains `json` data with a unique id within the `jsonl` file. (datatype: string/int)
+                    "source": source,  # projected input for the language model. This is the instruction. (datatype: string)
+                    "target": target,  # projected output for the language model. This is the gold response. (datatype: string)
+                    "psrc_prompt_template_signature": psrc_prompt_template_signature,  # prompt template signature from the promptsource repository. Usually a set of prompt templates is written for a task (i.e., glue/cola, glue/mrpc); this refers to that task. (datatype: string)
+                    "prompt_name": prompt_name,  # Name of the individual prompt template. Under a `psrc_prompt_template_signature` there can be many prompt templates; `prompt_name` refers to each of them. (datatype: string)
+                    "prompt_answer_choice_list": answer_choice_list,  # Names of all potential outcomes. Often empty, especially for generative tasks; only categorical tasks have this field (i.e., [yes, no], [True, False], [A, B, C, D]). (datatype: list of strings)
+                    "dataset_name": dataset_name,  # Name of the huggingface dataset (datatype: string)
+                    "dataset_config": dataset_config,  # Subset name of the huggingface dataset (datatype: string)
+                    "split": split,  # Split name (i.e., train, dev, test) (datatype: string)
+                    "metrics": prompt_template.metadata.metrics,  # metrics to evaluate the response. (datatype: list of strings)
+                    "original_task": prompt_template.metadata.original_task,  # Whether the prompted sample (source, target) reflects the original task of the dataset (datatype: True/False)
+                    "choices_in_prompt": prompt_template.metadata.choices_in_prompt,  # Whether the answer choices are included in the prompt text (datatype: True/False)
+                    "languages": prompt_template.metadata.languages,  # The language(s) of the prompt template (not the dataset). (datatype: list of strings)
+                }
+                if highlight_variables:
+                    # Add highlighting that distinguishes prompt tokens from dataset tokens.
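+                    # The highlighted source/target are stored as extra metadata fields, so the
+                    # plain `source`/`target` above remain untouched.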
+                    new_projected_sample = prompt_template.apply(
+                        sample, highlight_variables=highlight_variables
+                    )
+                    source, target = new_projected_sample
+                    projected_sample_with_metadata["highlighted_source"] = source
+                    projected_sample_with_metadata["highlighted_target"] = target
+
+                if add_source_metadata:
+                    # Take a backup of the data columns of the original dataset.
+                    # This will help us recover the original projection in case we lose track of the generated ones due to various modifications & filters.
+                    for k, v in sample.items():
+                        k = "src_meta_{}".format(k)
+                        assert k not in projected_sample_with_metadata
+                        projected_sample_with_metadata[k] = v
+
+                file_ptr.write(json.dumps(projected_sample_with_metadata))
+                file_ptr.write("\n")
+    return "Completed:: {} !".format(json_data_path)
+
+
+def xp3_export_dataset(
+    dataset_output_dir: str,
+    dataset_name: str,
+    dataset_config: str,
+    psrc_prompt_template_signature: str,
+    prompt_template: Type[Template],
+    dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
+    lang: str = 'en'
+) -> str:
+    """
+    Given a `hf-dataset` (arg: dataset) and a prompt template (arg: prompt_template),
+    projects/transforms samples from all splits of the dataset into an instruction format and
+    writes them to disk (arg: dataset_output_dir).
+
+    Args:
+        dataset_output_dir (str): Path to the output directory where data will be saved.
+        dataset_name (str): Name of the hf-dataset.
+        dataset_config (str): Name of the hf-dataset config.
+        psrc_prompt_template_signature (str): Name of the dataset & dataset-config that the prompts were written for.
+        prompt_template (Type[Template]): Transformation/projection module that takes a sample from arg:dataset and transforms it into an instruction.
+        dataset (Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]): huggingface dataset that will be transformed into an instruction dataset.
+        lang (str = 'en'): Language name of the dataset.
+    """
+    splits = list(dataset.keys())
+    prompt_name = prompt_template.get_name()
+    for split in splits:
+        dataset_split = dataset[split]
+        json_data_path = os.path.join(dataset_output_dir, split)
+        os.makedirs(json_data_path, exist_ok=True)
+        __simp_dataset_name = dataset_name.replace("/", "_")
+        __simp_prompt_name = prompt_name.replace("/", "_")
+        json_data_path = os.path.join(
+            json_data_path,
+            f"foraiml_{__simp_dataset_name}_{lang}_{__simp_prompt_name}.jsonl"
+        )
+        with open(json_data_path, "w", encoding="utf-8") as file_ptr:
+            total_num_sample = len(dataset_split)
+            for _id, sample in tqdm(
+                enumerate(dataset_split),
+                total=total_num_sample,
+                desc="{}_{}_{}_{}_{}".format(
+                    dataset_name,
+                    dataset_config,
+                    split,
+                    psrc_prompt_template_signature,
+                    prompt_name,
+                ),
+            ):
+                # Project/transform the sample into an instruction.
+                prompted_sample = prompt_template.apply(
+                    sample, highlight_variables=False
+                )
+                answer_choice_list = prompt_template.get_answer_choices_list(
+                    sample
+                )  # set of potential outcomes.
+                if (
+                    len(prompted_sample) != 2
+                ):  # if the prompt doesn't generate a (source, target) pair, the prompted_sample is invalid
+                    continue
+                source, target = prompted_sample
+                projected_sample_with_metadata = {
+                    "inputs": source,  # projected input for the language model. This is the instruction. (datatype: string)
+                    "targets": target,  # projected output for the language model. This is the gold response. (datatype: string)
+                }
+
+                file_ptr.write(json.dumps(projected_sample_with_metadata))
+                file_ptr.write("\n")
+    return "Completed:: {} !".format(json_data_path)
+
+
+def invoke_none(lst: List[str]) -> Union[List[str], None]:
+    """
+    Helper function.
+    Takes a list of strings and replaces placeholder values ("None", "none", "null", "") with `None`.
+    """
+    for idx, val in enumerate(lst):
+        if val == "None" or val == "none" or val == "null" or val == "":
+            lst[idx] = None
+    return lst
+
+
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset-name-or-paths",
+        nargs="+",
+        default="glue",
+        help="""A list of paths (separated by spaces) to huggingface datasets (or huggingface dataset signatures, i.e., super_glue, squad_v2).
+        The supported list can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates .
+        Usually prompt templates are written for a specific dataset. But for a new dataset,
+        it is possible to apply a prompt template written for a different dataset, as long as
+        the JSON structure of the new dataset is the same as what the original prompt template requires.""",
+    )
+    parser.add_argument(
+        "--dataset-configs",
+        nargs="+",
+        default=None,
+        help="""A list of huggingface dataset-configs. `--dataset-name-or-paths` together with `--dataset-configs` defines a data file.
+        If a dataset has no config in huggingface, use None. The first argument of `--dataset-name-or-paths` refers to the
+        first argument of `--dataset-configs`. There should be an equal number of arguments in `--dataset-name-or-paths` and `--dataset-configs`.""",
+    )
+    parser.add_argument(
+        "--prompt-templates-configs",
+        nargs="+",
+        default=None,
+        help="""Name of the prompt template. Use `None` if you want to project with all the prompt templates.
+        The first argument of `--dataset-name-or-paths` & `--dataset-configs` refers to the
+        first argument of `--prompt-templates-configs`. There should be an equal number of arguments in
+        `--dataset-name-or-paths`, `--dataset-configs` and `--prompt-templates-configs`.""",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=str,
+        required=True,
+        help="Path to the cache dir of huggingface datasets. (The directory may require very large space.)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Path to the output dir where the projected data will be stored.",
+    )
+    parser.add_argument(
+        "--num-proc", type=int, default=9, help="Total number of parallel processes."
+    )
+    parser.add_argument(
+        "--add-source-metadata",
+        action="store_true",
+        help="""
+        Add all the metadata from the source dataset. This will create new keys named `src_meta_{original_key}`,
+        where `original_key` ranges over the key names (a.k.a. column names) of the original dataset.
+        These variables are kept with the completion so that we can recover the projection again if needed.
+        """,
+    )
+    parser.add_argument(
+        "--highlight-variables",
+        action="store_true",
+        help="""Highlight tokens that come from the prompts and tokens that come from the original dataset.
+        This feature can be used to differentiate prompt tokens and input tokens.""",
+    )
+    parser.add_argument(
+        "--xp3-format",
+        action="store_true",
+        help="""Export the data in xP3 format.""",
+    )
+    parser.add_argument(
+        "--lang",
+        type=str,
+        default='en',
+        help="""Language name. Required for xP3 naming of the output file.""",
+    )
+    args = parser.parse_args()
+
+    assert len(args.dataset_name_or_paths) == len(args.dataset_configs)
+    assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs)
+    export_dataset_func = xp3_export_dataset if args.xp3_format else export_dataset
+    if args.xp3_format and args.highlight_variables:
+        print(f"Ignoring {args.highlight_variables=} since {args.xp3_format=}")
+    if args.xp3_format and args.add_source_metadata:
+        print(f"Ignoring {args.add_source_metadata=} since {args.xp3_format=}")
+
+    invoke_none(args.dataset_name_or_paths)
+    invoke_none(args.dataset_configs)
+    invoke_none(args.prompt_templates_configs)
+
+    prompted_sample_gen_io_tuple_list = []
+    # Load and cache each dataset & build the multiprocessing i/o tuples for the projection.
+    for (dataset_name_or_path, dataset_config, prompt_template_config) in zip(
+        args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs
+    ):
+        dataset = datasets.load_dataset(
+            dataset_name_or_path, dataset_config, cache_dir=args.cache_dir
+        )
+        psrc_prompt_template_signature = prompt_template_config
+        if psrc_prompt_template_signature is None:
+            if dataset_config is None:
+                psrc_prompt_template_signature = "{}".format(dataset_name_or_path)
+            else:
+                psrc_prompt_template_signature = "{}/{}".format(
+                    dataset_name_or_path, dataset_config
+                )
+        dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path.replace("/", "_"))
+        os.makedirs(dataset_output_dir, exist_ok=True)
+        if dataset_config is not None:
+            dataset_output_dir = os.path.join(dataset_output_dir, dataset_config)
+            os.makedirs(dataset_output_dir, exist_ok=True)
+        prompt_templates = DatasetTemplates(psrc_prompt_template_signature)
+        prompt_names = list(prompt_templates.name_to_id_mapping.keys())
+        for prompt_name in prompt_names:
+            prompt_template = prompt_templates[prompt_name]
+            # Pre-compute the argument tuple for multiprocessing.
+            if args.xp3_format:
+                prompted_sample_gen_io_tuple = (
+                    dataset_output_dir,
+                    dataset_name_or_path,
+                    dataset_config,
+                    psrc_prompt_template_signature,
+                    prompt_template,
+                    dataset
+                )
+            else:
+                prompted_sample_gen_io_tuple = (
+                    dataset_output_dir,
+                    dataset_name_or_path,
+                    dataset_config,
+                    psrc_prompt_template_signature,
+                    prompt_template,
+                    dataset,
+                    args.add_source_metadata,
+                    args.highlight_variables,
+                )
+            prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple)
+
+    # Project the data using multiprocessing.
+    # It is recommended to use a machine with many CPUs if you are projecting multiple datasets;
+    # set `--num-proc` accordingly.
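+    # Cap the worker count at the number of projection jobs so that no idle workers are spawned.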
+    num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list))
+    with concurrent.futures.ProcessPoolExecutor(max_workers=num_proc) as executor:
+        try:
+            for _out in tqdm(
+                executor.map(export_dataset_func, *zip(*prompted_sample_gen_io_tuple_list)),
+                total=len(prompted_sample_gen_io_tuple_list),
+            ):
+                logger.info(_out)
+        except Exception as emsg:
+            logger.warning("Exception msg: {}".format(emsg))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 9c6d441..1c3c7a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,3 +51,4 @@ wrapt==1.14.1
 xxhash==3.2.0
 yapf==0.32.0
 yarl==1.8.2
+promptsource==0.2.3
diff --git a/scripts/project_from_promptsource.sh b/scripts/project_from_promptsource.sh
new file mode 100644
index 0000000..36b644a
--- /dev/null
+++ b/scripts/project_from_promptsource.sh
@@ -0,0 +1,44 @@
+# The native format with a lot of metadata
+DUMP_FOLDER='./raw'
+SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource
+mkdir -p $SRC_DATA_FOLDER
+mkdir -p $SRC_DATA_FOLDER/cache
+
+python data/project_from_promptsource.py \
+--dataset-name-or-paths nq_open \
+--dataset-configs None \
+--prompt-templates-configs None \
+--cache-dir $SRC_DATA_FOLDER/cache \
+--output-dir $SRC_DATA_FOLDER \
+--highlight-variables \
+--add-source-metadata \
+--num-proc 16
+
+# The xP3 format
+DUMP_FOLDER='./raw'
+SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource
+mkdir -p $SRC_DATA_FOLDER
+mkdir -p $SRC_DATA_FOLDER/cache
+
+python data/project_from_promptsource.py \
+--dataset-name-or-paths nq_open \
+--dataset-configs None \
+--prompt-templates-configs None \
+--cache-dir $SRC_DATA_FOLDER/cache \
+--output-dir $SRC_DATA_FOLDER \
+--xp3-format \
+--num-proc 16
+
+# Project custom data
+DUMP_FOLDER='./raw'
+SRC_DATA_FOLDER=$DUMP_FOLDER/project_from_promptsource
+mkdir -p $SRC_DATA_FOLDER
+mkdir -p $SRC_DATA_FOLDER/cache
+
+python data/project_from_promptsource.py \
+--dataset-name-or-paths shmuhammad/AfriSenti-twitter-sentiment \
+--dataset-configs amh \
+--prompt-templates-configs None \
+--cache-dir $SRC_DATA_FOLDER/cache \
+--output-dir $SRC_DATA_FOLDER \
+--num-proc 16
\ No newline at end of file