diff --git a/evals/evaluation/HELMET/README.md b/evals/evaluation/HELMET/README.md index 490426e9..12325290 100644 --- a/evals/evaluation/HELMET/README.md +++ b/evals/evaluation/HELMET/README.md @@ -157,7 +157,7 @@ python eval.py --config configs/cite.yaml --use_vllm Disclaimer: VLLM can be much faster than using the native HuggingFace generation; however, we found that the results can be slightly different, so we recommend using the native HuggingFace generation for the final evaluation. All reported results in the paper are from the native HuggingFace generation. -The speedup is much more noticable for tasks that generates more tokens (e.g., summarization may see up to 2x speedup), whereas the speedup is less noticable for tasks that generate fewer tokens (e.g., JSON KV may see less than 5% speedup). +The speedup is much more noticeable for tasks that generate more tokens (e.g., summarization may see up to 2x speedup), whereas the speedup is less noticeable for tasks that generate fewer tokens (e.g., JSON KV may see less than 5% speedup). @@ -211,7 +211,7 @@ Please also cite the original dataset creators, listed below: @inproceedings{mallen-etal-2023-trust, title = "When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories", author = "Mallen, Alex and - Asai, Akari and + Asai, Akari and Zhong, Victor and Das, Rajarshi and Khashabi, Daniel and diff --git a/evals/evaluation/HELMET/arguments.py b/evals/evaluation/HELMET/arguments.py index fac0ee67..093521ec 100644 --- a/evals/evaluation/HELMET/arguments.py +++ b/evals/evaluation/HELMET/arguments.py @@ -1,8 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse -import yaml import ast import os +import yaml + + def parse_arguments(): parser = argparse.ArgumentParser(description="evaluation on downstream tasks") parser.add_argument("--config", type=str, default=None, help="path to config file") @@ -27,27 +32,59 @@ def parse_arguments(): # evaluation settings parser.add_argument("--shots", type=int, default=5, help="total number of demos (encoder + decoder)") - parser.add_argument("--input_max_length", type=str, default='8192', help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets") + parser.add_argument( + "--input_max_length", + type=str, + default="8192", + help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets", + ) # generation settings - parser.add_argument("--do_sample", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use sampling (false is greedy), overwrites temperature") - parser.add_argument("--generation_max_length", type=str, default='10', help="max number of tokens to generate, can be separated by comma to match the specified datasets") + parser.add_argument( + "--do_sample", + type=ast.literal_eval, + choices=[True, False], + default=False, + help="whether to use sampling (false is greedy), overwrites temperature", + ) + parser.add_argument( + "--generation_max_length", + type=str, + default="10", + help="max number of tokens to generate, can be separated by comma to match the specified datasets", + ) parser.add_argument("--generation_min_length", type=int, default=0, help="min number of tokens to generate") parser.add_argument("--temperature", type=float, default=1.0, help="generation temperature") 
parser.add_argument("--top_p", type=float, default=1.0, help="top-p parameter for nucleus sampling") - parser.add_argument("--stop_newline", type=ast.literal_eval, choices=[True, False], default=False, help="whether to stop generation at newline") + parser.add_argument( + "--stop_newline", + type=ast.literal_eval, + choices=[True, False], + default=False, + help="whether to stop generation at newline", + ) # model specific settings parser.add_argument("--seed", type=int, default=42, help="random seed") parser.add_argument("--no_cuda", action="store_true", help="disable cuda") parser.add_argument("--no_bf16", action="store_true", help="disable bf16 and use fp32") parser.add_argument("--no_torch_compile", action="store_true", help="disable cuda") - parser.add_argument("--use_chat_template", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use chat template") + parser.add_argument( + "--use_chat_template", + type=ast.literal_eval, + choices=[True, False], + default=False, + help="whether to use chat template", + ) parser.add_argument("--rope_theta", type=int, default=None, help="override rope theta") # misc parser.add_argument("--debug", action="store_true", help="for debugging") - parser.add_argument("--count_tokens", action="store_true", help="instead of running generation, just count the number of tokens (only for HF models not API)") + parser.add_argument( + "--count_tokens", + action="store_true", + help="instead of running generation, just count the number of tokens (only for HF models not API)", + ) args = parser.parse_args() config = yaml.safe_load(open(args.config)) if args.config is not None else {} diff --git a/evals/evaluation/HELMET/configs/cite.yaml b/evals/evaluation/HELMET/configs/cite.yaml index 58f45fac..3657ef33 100644 --- a/evals/evaluation/HELMET/configs/cite.yaml +++ b/evals/evaluation/HELMET/configs/cite.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072 datasets: alce_asqa_700,alce_qampari_700 generation_max_length: 300,300 diff --git a/evals/evaluation/HELMET/configs/cite_short.yaml b/evals/evaluation/HELMET/configs/cite_short.yaml index d6714b33..8819ab5d 100644 --- a/evals/evaluation/HELMET/configs/cite_short.yaml +++ b/evals/evaluation/HELMET/configs/cite_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 datasets: alce_asqa_30,alce_asqa_75,alce_asqa_165,alce_asqa_345,alce_qampari_30,alce_qampari_75,alce_qampari_165,alce_qampari_345 generation_max_length: 300,300,300,300,300,300,300,300 diff --git a/evals/evaluation/HELMET/configs/icl.yaml b/evals/evaluation/HELMET/configs/icl.yaml index ace3f467..06549ccf 100644 --- a/evals/evaluation/HELMET/configs/icl.yaml +++ b/evals/evaluation/HELMET/configs/icl.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072,131072,131072,131072 datasets: icl_trec_coarse_6600shot_balance,icl_trec_fine_6400shot_balance,icl_banking77_5900shot_balance,icl_clinic150_7050shot_balance,icl_nlu_8296shot_balance generation_max_length: 20,20,20,20,20 diff --git a/evals/evaluation/HELMET/configs/icl_short.yaml b/evals/evaluation/HELMET/configs/icl_short.yaml index 3404b943..d93ba9c4 100644 --- a/evals/evaluation/HELMET/configs/icl_short.yaml +++ b/evals/evaluation/HELMET/configs/icl_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 datasets: icl_trec_coarse_400shot_balance,icl_trec_coarse_800shot_balance,icl_trec_coarse_1600shot_balance,icl_trec_coarse_3300shot_balance,icl_trec_fine_400shot_balance,icl_trec_fine_800shot_balance,icl_trec_fine_1600shot_balance,icl_trec_fine_3200shot_balance,icl_banking77_360shot_balance,icl_banking77_720shot_balance,icl_banking77_1450shot_balance,icl_banking77_2900shot_balance,icl_clinic150_440shot_balance,icl_clinic150_880shot_balance,icl_clinic150_1750shot_balance,icl_clinic150_3525shot_balance,icl_nlu_510shot_balance,icl_nlu_1020shot_balance,icl_nlu_2040shot_balance,icl_nlu_4080shot_balance generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 diff --git a/evals/evaluation/HELMET/configs/longqa.yaml b/evals/evaluation/HELMET/configs/longqa.yaml index 3ccb43c5..29eeba38 100644 --- a/evals/evaluation/HELMET/configs/longqa.yaml +++ b/evals/evaluation/HELMET/configs/longqa.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072,131072 datasets: narrativeqa_130772,infbench_qa_eng_130862,infbench_choice_eng_130862 generation_max_length: 100,10,10 diff --git a/evals/evaluation/HELMET/configs/longqa_short.yaml b/evals/evaluation/HELMET/configs/longqa_short.yaml index fe96348a..1b423c16 100644 --- a/evals/evaluation/HELMET/configs/longqa_short.yaml +++ b/evals/evaluation/HELMET/configs/longqa_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 datasets: narrativeqa_7892,narrativeqa_16084,narrativeqa_32468,narrativeqa_65236,infbench_qa_eng_7982,infbench_qa_eng_16174,infbench_qa_eng_32558,infbench_qa_eng_65326,infbench_choice_eng_7982,infbench_choice_eng_16174,infbench_choice_eng_32558,infbench_choice_eng_65326 generation_max_length: 100,100,100,100,10,10,10,10,10,10,10,10 diff --git a/evals/evaluation/HELMET/configs/niah.yaml b/evals/evaluation/HELMET/configs/niah.yaml index b90f52de..bad80acb 100644 --- a/evals/evaluation/HELMET/configs/niah.yaml +++ b/evals/evaluation/HELMET/configs/niah.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072 datasets: ruler_niah_s_2 generation_max_length: 50 diff --git a/evals/evaluation/HELMET/configs/niah_long.yaml b/evals/evaluation/HELMET/configs/niah_long.yaml index b3f79e3b..c485b071 100644 --- a/evals/evaluation/HELMET/configs/niah_long.yaml +++ b/evals/evaluation/HELMET/configs/niah_long.yaml @@ -1,7 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072 datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2 generation_max_length: 50,50,50,50,50,50,50,50,50,50,100,100,100,100,50,50,100,100,50,50,50,50,50,50,50,50 -test_files: 
data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl +test_files: data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,' use_chat_template: false max_test_samples: 100 diff --git a/evals/evaluation/HELMET/configs/rag.yaml b/evals/evaluation/HELMET/configs/rag.yaml index cfc9de3e..2df6d5c9 100644 --- a/evals/evaluation/HELMET/configs/rag.yaml +++ b/evals/evaluation/HELMET/configs/rag.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072,131072,131072 datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3 generation_max_length: 20,20,20,20 diff --git a/evals/evaluation/HELMET/configs/rag_short.yaml b/evals/evaluation/HELMET/configs/rag_short.yaml index 7a3f3d06..bda6de31 100644 --- a/evals/evaluation/HELMET/configs/rag_short.yaml +++ b/evals/evaluation/HELMET/configs/rag_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 datasets: 
kilt_nq,kilt_nq,kilt_nq,kilt_nq,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3 generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 diff --git a/evals/evaluation/HELMET/configs/recall.yaml b/evals/evaluation/HELMET/configs/recall.yaml index 7a87ea26..367ddec4 100644 --- a/evals/evaluation/HELMET/configs/recall.yaml +++ b/evals/evaluation/HELMET/configs/recall.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072,131072,131072 datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv generation_max_length: 50,100,50,100 diff --git a/evals/evaluation/HELMET/configs/recall_short.yaml b/evals/evaluation/HELMET/configs/recall_short.yaml index 025551c2..1d4b9970 100644 --- a/evals/evaluation/HELMET/configs/recall_short.yaml +++ b/evals/evaluation/HELMET/configs/recall_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 datasets: ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,json_kv,json_kv,json_kv,json_kv generation_max_length: 50,50,50,50,100,100,100,100,50,50,50,50,100,100,100,100 diff --git a/evals/evaluation/HELMET/configs/rerank.yaml b/evals/evaluation/HELMET/configs/rerank.yaml index 5b3fba29..12023e7e 100644 --- a/evals/evaluation/HELMET/configs/rerank.yaml +++ b/evals/evaluation/HELMET/configs/rerank.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: '131072' datasets: msmarco_rerank_psg generation_max_length: '200' diff --git a/evals/evaluation/HELMET/configs/rerank_short.yaml b/evals/evaluation/HELMET/configs/rerank_short.yaml index 90a957e2..1d5508eb 100644 --- a/evals/evaluation/HELMET/configs/rerank_short.yaml +++ b/evals/evaluation/HELMET/configs/rerank_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536 datasets: msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg generation_max_length: 200,200,200,200 diff --git a/evals/evaluation/HELMET/configs/summ.yaml b/evals/evaluation/HELMET/configs/summ.yaml index 53d67ed5..08cd5847 100644 --- a/evals/evaluation/HELMET/configs/summ.yaml +++ b/evals/evaluation/HELMET/configs/summ.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 131072,131072 datasets: infbench_sum_eng_129672,multi_lexsum_130372 generation_max_length: 1200,400 diff --git a/evals/evaluation/HELMET/configs/summ_short.yaml b/evals/evaluation/HELMET/configs/summ_short.yaml index de81cd57..4b7729bb 100644 --- a/evals/evaluation/HELMET/configs/summ_short.yaml +++ b/evals/evaluation/HELMET/configs/summ_short.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 datasets: infbench_sum_eng_6792,infbench_sum_eng_14984,infbench_sum_eng_31368,infbench_sum_eng_64136,multi_lexsum_7492,multi_lexsum_15684,multi_lexsum_32068,multi_lexsum_64836 generation_max_length: 1200,1200,1200,1200,400,400,400,400 diff 
--git a/evals/evaluation/HELMET/data.py b/evals/evaluation/HELMET/data.py index 9efac614..a9cc9936 100644 --- a/evals/evaluation/HELMET/data.py +++ b/evals/evaluation/HELMET/data.py @@ -1,23 +1,24 @@ -import json -import os -import sys +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import copy +import json +import logging import math +import os import random -import numpy as np - +import re +import sys from collections import defaultdict + +import numpy as np from datasets import load_dataset, load_from_disk from torch.utils.data import Dataset from tqdm import tqdm from transformers import AutoTokenizer +from utils import calculate_metrics, calculate_retrieval_metrics, parse_output, parse_rankings -import re -from utils import calculate_metrics, parse_output, parse_rankings, calculate_retrieval_metrics - -import logging -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S') +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -48,9 +49,7 @@ def drop_duplicates(data, key="id"): def load_qa(dataset, path, demo_path, max_test_samples=None, popularity_threshold=None, shots=0): - """ - Load the data for QA tasks - """ + """Load the data for QA tasks.""" if "nq_bad" in dataset: user_template = "Use the given documents to write a concise and short answer to the question. Only use the information presented in the documents, and output 'unanswerable' if the question is not valid or cannot be answered with the given document. Write your answer in the following format:\nAnswer: [answer]\n\n{demos}{context}\n\nQuestion: {question}" else: @@ -64,8 +63,13 @@ def load_qa(dataset, path, demo_path, max_test_samples=None, popularity_threshol data = load_dataset("json", data_files=path)["train"] else: data = load_from_disk(path) - return {"data": data, "prompt_template": prompt_template, "user_template": user_template, "system_template": system_template} - + return { + "data": data, + "prompt_template": prompt_template, + "user_template": user_template, + "system_template": system_template, + } + if demo_path.endswith(".json"): if "nq_bad" in dataset: with open(demo_path) as f: @@ -77,8 +81,8 @@ def load_qa(dataset, path, demo_path, max_test_samples=None, popularity_threshol # popularity filtering for popqa if "popqa" in dataset and popularity_threshold is not None: - data = data.filter(lambda x: math.log10(x['s_pop']) < popularity_threshold) - demo_data = demo_data.filter(lambda x: math.log10(x['s_pop']) < popularity_threshold) + data = data.filter(lambda x: math.log10(x["s_pop"]) < popularity_threshold) + demo_data = demo_data.filter(lambda x: math.log10(x["s_pop"]) < popularity_threshold) key = "id" if "id" in data.column_names else "question" if max_test_samples is not None: @@ -90,22 +94,36 @@ def load_qa(dataset, path, demo_path, max_test_samples=None, popularity_threshol # demo_template = "Document (Title: {gold_title}): {gold_doc}\n\nQuestion: {question}\nAnswer: {answer}" demo_template = "{documents}\n\nQuestion: {question}\nAnswer: {answer}" passage_template = "Document (Title: {title}): {text}" + def update(sample): demos = demo_data demo_text = "" if shots > 0: - if 'popqa' in dataset: + if "popqa" in dataset: # popqa only has one split demos = demo_data.filter(lambda x: x[key] != sample[key]) # seed ensures that we get the same demos for the same question demos = 
demos.shuffle(seed=abs(hash(sample[key])) % (2**31)) demos = drop_duplicates(demos, key).select(range(shots)) - demo_text = "\n\n".join([demo_template.format(**d, documents="\n\n".join([passage_template.format(**c) for c in d["ctxs"]]), answer=d["answers"][0]) for d in demos]) + "\n\n" + demo_text = ( + "\n\n".join( + [ + demo_template.format( + **d, + documents="\n\n".join([passage_template.format(**c) for c in d["ctxs"]]), + answer=d["answers"][0], + ) + for d in demos + ] + ) + + "\n\n" + ) passage_text = "" - if len(sample['ctxs']) > 0: - passage_text = "\n\n".join([passage_template.format(**c) for c in sample['ctxs']]) + if len(sample["ctxs"]) > 0: + passage_text = "\n\n".join([passage_template.format(**c) for c in sample["ctxs"]]) return {"demos": demo_text, "context": passage_text, "answer": sample["answers"]} + data = data.map(update) return { @@ -128,13 +146,23 @@ def load_json_kv(path, shots, max_test_samples=None, seed=42): data = load_dataset("json", data_files=path)["train"] else: data = load_from_disk(path) - return {"data": data, "prompt_template": prompt_template, "user_template": user_template, "system_template": system_template} + return { + "data": data, + "prompt_template": prompt_template, + "user_template": user_template, + "system_template": system_template, + } demo_template = "Key: {key}\nCorresponding value:{value}" - data = data.map(lambda x: { - "demos": "\n\n".join([demo_template.format(key=key, value=" "+value) for key, value in x["demos"][:shots]]) + ("\n\n" if shots > 0 else ""), - "k": x["num_kvs"], - }) + data = data.map( + lambda x: { + "demos": "\n\n".join( + [demo_template.format(key=key, value=" " + value) for key, value in x["demos"][:shots]] + ) + + ("\n\n" if shots > 0 else ""), + "k": x["num_kvs"], + } + ) if max_test_samples is not None: data = data.shuffle(seed=seed).select(range(min(max_test_samples, len(data)))) @@ -150,9 +178,9 @@ def post_process(output, example): return mets, {"parsed_output": parsed_pred} return { - "data": data, - "prompt_template": prompt_template, - "user_template": user_template, + "data": data, + "prompt_template": prompt_template, + "user_template": user_template, "system_template": system_template, "post_process": post_process, } @@ -161,17 +189,20 @@ def post_process(output, example): def truncate_llama2(dataset, data, postfix_text=" ... 
[the rest of the text is omitted]"): # use the llama 2 tokenizer to truncate to max_length, which only applies to the main document (context) and exclude the instructions and the demos # this is to make sure that every model see the same amount of information - max_length = int(dataset.split("_")[-1]) + max_length = int(dataset.split("_")[-1]) tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") separator_length = len(tokenizer(postfix_text)["input_ids"]) - + def truncate(sample): # tokens = tokenizer(sample["context"], max_length=max_length, truncation=True, return_offsets_mapping=True) tokens = tokenizer(sample["context"], return_offsets_mapping=True) if len(tokens["input_ids"]) > max_length: # we need to truncate - sample["context"] = sample["context"][:tokens["offset_mapping"][max_length-separator_length][1]] + postfix_text + sample["context"] = ( + sample["context"][: tokens["offset_mapping"][max_length - separator_length][1]] + postfix_text + ) return sample + return data.map(truncate, num_proc=16) @@ -187,12 +218,26 @@ def load_narrativeqa(dataset, path=None, shots=0, max_samples=None, seed=42): data = all_data["test"].shuffle(seed=seed) if max_samples is not None: data = data.select(range(min(max_samples, len(data)))) - data = data.map(lambda example: { - "context": example["document"]["text"], - "question": example["question"]["text"], - "answer": [ex["text"] for ex in example["answers"]], - "demo": "" if shots == 0 else "For example:\n\n" + "\n\n".join([f"Question: {ex['question']['text']}\nAnswer: {ex['answers'][0]['text']}" for ex in all_data["train"].shuffle().select(range(shots))]) + "\n\nNow, use the following story to answer the question:\n\n" - }, remove_columns=["document", "answers"]) + data = data.map( + lambda example: { + "context": example["document"]["text"], + "question": example["question"]["text"], + "answer": [ex["text"] for ex in example["answers"]], + "demo": ( + "" + if shots == 0 + else "For example:\n\n" + + "\n\n".join( + [ + f"Question: {ex['question']['text']}\nAnswer: {ex['answers'][0]['text']}" + for ex in all_data["train"].shuffle().select(range(shots)) + ] + ) + + "\n\nNow, use the following story to answer the question:\n\n" + ), + }, + remove_columns=["document", "answers"], + ) data = truncate_llama2(dataset, data) return { @@ -236,16 +281,37 @@ def load_qasper(dataset, path=None, shots=0, max_samples=None, seed=42): if max_samples is not None: data = data.select(range(min(max_samples, len(data)))) - data = data.map(lambda example: { - "context": example["input"][example["input"].index("\n\n")+2:].strip(), - "question": example["input"][:example["input"].index("\n\n")].strip(), - "answer": example["outputs"], - # "demo": "" if shots == 0 else "\n\n".join(["[Text omitted]\n\nQuestion: {}\nAnswer: {}".format(ex['input'][:ex['input'].index('\n\n')].strip(), ex['outputs'][0]) for ex in train_data.shuffle().select(range(shots))]) + "\n\n" - "demo": "" if shots == 0 else "For example:\n\n" + "\n\n".join(["Question: {}\nAnswer: {}".format(ex['input'][:ex['input'].index('\n\n')].strip(), ex['outputs'][0]) for ex in train_data.shuffle().select(range(shots))]) + "\n\nNow, use the following article to answer the question:\n\n" - }, remove_columns=["outputs"]) + data = data.map( + lambda example: { + "context": example["input"][example["input"].index("\n\n") + 2 :].strip(), + "question": example["input"][: example["input"].index("\n\n")].strip(), + "answer": example["outputs"], + # "demo": "" if shots == 0 else "\n\n".join(["[Text 
omitted]\n\nQuestion: {}\nAnswer: {}".format(ex['input'][:ex['input'].index('\n\n')].strip(), ex['outputs'][0]) for ex in train_data.shuffle().select(range(shots))]) + "\n\n" + "demo": ( + "" + if shots == 0 + else "For example:\n\n" + + "\n\n".join( + [ + "Question: {}\nAnswer: {}".format( + ex["input"][: ex["input"].index("\n\n")].strip(), ex["outputs"][0] + ) + for ex in train_data.shuffle().select(range(shots)) + ] + ) + + "\n\nNow, use the following article to answer the question:\n\n" + ), + }, + remove_columns=["outputs"], + ) data = truncate_llama2(dataset, data) - - return {"data": data, "prompt_template": prompt_template, "user_template": user_template, "system_template": system_template} + + return { + "data": data, + "prompt_template": prompt_template, + "user_template": user_template, + "system_template": system_template, + } def load_multi_lexsum(dataset, path=None, shots=0, max_samples=None, seed=42): @@ -257,12 +323,22 @@ def load_multi_lexsum(dataset, path=None, shots=0, max_samples=None, seed=42): prompt_template = user_template + "\n\n" + system_template train_data = all_data["train"] - all_data = all_data.map(lambda x: { - "context": '\n\n'.join(x["sources"]), - "demo": "" if shots == 0 else "Example summaries:\n\n" + "\n\n".join(["Summary: {}".format(ex["summary/short"]) for ex in train_data.shuffle().select(range(shots))]) + "\n\nNow, write a summary of the following legal documents.\n", - "answer": x["summary/short"], - "question": "", - }) + all_data = all_data.map( + lambda x: { + "context": "\n\n".join(x["sources"]), + "demo": ( + "" + if shots == 0 + else "Example summaries:\n\n" + + "\n\n".join( + ["Summary: {}".format(ex["summary/short"]) for ex in train_data.shuffle().select(range(shots))] + ) + + "\n\nNow, write a summary of the following legal documents.\n" + ), + "answer": x["summary/short"], + "question": "", + } + ) all_data = truncate_llama2(dataset, all_data) test_data = all_data["validation"] @@ -279,7 +355,7 @@ def post_process(output, example): if max_samples is not None and len(test_data) > max_samples: test_data = test_data.shuffle(seed=seed).select(range(max_samples)) - + return { "data": test_data, "prompt_template": prompt_template, @@ -300,7 +376,7 @@ def load_msmarco_rerank(path, demo_path=None, max_test_samples=None, shots=0, se data = load_dataset("json", data_files=path)["train"] else: data = load_from_disk(path) - + demos = load_dataset("json", data_files=demo_path)["train"] def get_qrels(data): @@ -315,7 +391,7 @@ def get_qrels(data): keys = set(data[key]) keys = random.sample(sorted(keys), min(max_test_samples, len(keys))) data = data.filter(lambda x: x[key] in keys) - + # the k values are used to calculate metrics later k_values = [1, 5, 10, 20, 50, 100, 200, 500, 1000] k_values = [k for k in k_values if k <= len(data[0]["ctxs"])] @@ -323,7 +399,7 @@ def get_qrels(data): # could also do this question by question, but not necessary if we are sampling demo_filtered = False - if len(demos) > 2*len(data): + if len(demos) > 2 * len(data): qids = set(data["qid"]) demos = demos.filter(lambda x: x["qid"] not in qids) demo_filtered = True @@ -331,9 +407,13 @@ def get_qrels(data): def update(sample, demos): passage_text = "" - passage_template = "[ID: {id}] Document (Title: {title}): {text}" if "title" in sample["ctxs"][0] else "[ID: {id}] Document: {text}" - passage_text = "\n\n".join([passage_template.format(**c) for c in sample['ctxs']]) - gold_ranking = " > ".join([x['id'] for x in sorted(sample["ctxs"], key=lambda x: x["label"], 
reverse=True)]) + passage_template = ( + "[ID: {id}] Document (Title: {title}): {text}" + if "title" in sample["ctxs"][0] + else "[ID: {id}] Document: {text}" + ) + passage_text = "\n\n".join([passage_template.format(**c) for c in sample["ctxs"]]) + gold_ranking = " > ".join([x["id"] for x in sorted(sample["ctxs"], key=lambda x: x["label"], reverse=True)]) demo_text = "" if shots > 0: @@ -341,8 +421,8 @@ def update(sample, demos): if not demo_filtered: demos = demos.filter(lambda x: x["qid"] != sample["qid"]) demo = demos.shuffle(seed=abs(hash(sample["qid"])) % (2**31)) - demo = drop_duplicates(demo, 'qid').select(range(shots)) - + demo = drop_duplicates(demo, "qid").select(range(shots)) + demo_ids = set() for d in demo: if d["qid"] in demo_ids or len(demo_ids) >= shots: @@ -350,8 +430,12 @@ def update(sample, demos): demo_ids.add(d["qid"]) # sort ids by label ids = sorted(d["ctxs"], key=lambda x: x["label"], reverse=True) - ranking = " > ".join([x['id'] for x in ids]) - demo_text += "\n\n".join([passage_template.format(**c) for c in d['ctxs']]) + f"\n\nQuery: {d['query']}\nRanking: {ranking}" + "\n\n" + ranking = " > ".join([x["id"] for x in ids]) + demo_text += ( + "\n\n".join([passage_template.format(**c) for c in d["ctxs"]]) + + f"\n\nQuery: {d['query']}\nRanking: {ranking}" + + "\n\n" + ) return {"context": passage_text, "question": sample["query"], "demos": demo_text, "answer": gold_ranking} @@ -361,7 +445,7 @@ def post_process(output, example): parsed_pred = parse_rankings(output["output"]) o = {"parsed_output": parsed_pred} # qrels = {k: v for k, v in example["qrel"].items() if v is not None} - mets = calculate_retrieval_metrics({example['qid']: parsed_pred}, qrels, k_values) + mets = calculate_retrieval_metrics({example["qid"]: parsed_pred}, qrels, k_values) mets = {**mets, "num_preds": len(parsed_pred)} return mets, o @@ -382,14 +466,14 @@ def load_icl(dataset, max_test_sample=None, seed=42): if "trec_fine" in dataset.lower(): train_data = load_dataset("CogComp/trec", trust_remote_code=True)["train"] test_data = load_dataset("CogComp/trec", trust_remote_code=True)["test"] - id2label = train_data.features['fine_label'].names + id2label = train_data.features["fine_label"].names text_field = "text" label_field = "fine_label" num_labels = 50 elif "trec_coarse" in dataset.lower(): train_data = load_dataset("CogComp/trec", trust_remote_code=True)["train"] test_data = load_dataset("CogComp/trec", trust_remote_code=True)["test"] - id2label = train_data.features['coarse_label'].names + id2label = train_data.features["coarse_label"].names text_field = "text" label_field = "coarse_label" num_labels = 6 @@ -418,8 +502,8 @@ def load_icl(dataset, max_test_sample=None, seed=42): label_field = "label" num_labels = 68 else: - raise NotImplementedError(f"Unknown ICL dataset") - + raise NotImplementedError("Unknown ICL dataset") + def balance_labels(data, shots): # for each data point, we are going to sample a random set of demos with balanced labels # there are two places where randomness is involved: the selection of the demos and the final shuffle @@ -428,7 +512,7 @@ def balance_labels(data, shots): label_mapping = {x[label_field]: [] for x in data} for x in data: label_mapping[x[label_field]].append(x) - + # rearrange the data such that every label has the same number of samples # they are also in consecutive sets with random order in each set num_rounds = math.ceil(shots / len(label_mapping)) @@ -436,9 +520,9 @@ def balance_labels(data, shots): for _, samples in label_mapping.items(): indices 
= rand.sample(range(len(samples)), num_rounds % len(samples)) while len(indices) < num_rounds: - # sample with replacement if necessary, shouldn't happen unless we have very many shots + # sample with replacement if necessary, shouldn't happen unless we have very many shots indices += rand.sample(range(len(samples)), min(num_rounds - len(indices), len(samples))) - + for i, idx in enumerate(indices): new_data[i].append(samples[idx]) @@ -446,12 +530,12 @@ def balance_labels(data, shots): rand.shuffle(new_data[i]) new_data = [item for sublist in new_data for item in sublist][:shots] return new_data - + if max_test_sample is not None and len(test_data) > max_test_sample: test_data = test_data.shuffle(seed=seed).select(range(max_test_sample)) item_template = "{text}\nlabel: {label}" - user_template = "Use the provided mapping from the text to label to assign a label to the text. Only output \"label: {{label}}\" and nothing else. \n\n{context}\n\n{question}" + user_template = 'Use the provided mapping from the text to label to assign a label to the text. Only output "label: {{label}}" and nothing else. \n\n{context}\n\n{question}' system_template = "label:" prompt_template = user_template + "\n" + system_template @@ -474,12 +558,20 @@ def preprocess(sample): random.seed(local_seed) random.shuffle(label_mapping) - context = "\n\n".join([ - item_template.format(text=selected_item[text_field], label=str(label_mapping[int(selected_item[label_field])])) - for selected_item in demos] + context = "\n\n".join( + [ + item_template.format( + text=selected_item[text_field], label=str(label_mapping[int(selected_item[label_field])]) + ) + for selected_item in demos + ] ) - return {"context": context, "question": sample[text_field], "answer": str(label_mapping[int(sample[label_field])])} - + return { + "context": context, + "question": sample[text_field], + "answer": str(label_mapping[int(sample[label_field])]), + } + final_data = test_data.map(preprocess, num_proc=40) def post_process(output, example): @@ -517,7 +609,7 @@ def load_ruler(dataset, path, max_test_samples=None, seed=42): elif "cwe" in dataset: user_template = "{example}Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.\n{context}\nQuestion: What are the 10 most common words in the above list?" system_template = "Answer: The top 10 words that appear most often in the list are:" - elif "fwe" in dataset: + elif "fwe" in dataset: user_template = "Read the following coded text and track the frequency of each coded word. Find the three most frequently appeared coded words.\n{context}\nQuestion: Do not provide any explanation. Please ignore the dots '....'. What are the three most frequently appeared words in the above coded text?" 
system_template = "Answer: According to the coded text above, the three most frequently appeared words are:" elif "qa" in dataset: @@ -530,10 +622,13 @@ def load_ruler(dataset, path, max_test_samples=None, seed=42): def process_example(example): return { - "question": example["query"] if "query" in example else example["question"] if "question" in example else "", + "question": ( + example["query"] if "query" in example else example["question"] if "question" in example else "" + ), "example": example["example"] + "\n\n" if "example" in example and example["example"] != "" else "", - "answer": example["answer"] if "answer" in example else example['outputs'], + "answer": example["answer"] if "answer" in example else example["outputs"], } + data = data.map(process_example) def post_process(output, example): @@ -543,7 +638,7 @@ def post_process(output, example): recall = sum([a.lower() in prediction.lower() for a in answer]) / len(answer) mets = {"ruler_recall": recall} return mets, {"parsed_output": prediction} - + if max_test_samples is not None: data = data.shuffle(seed).select(range(min(len(data), max_test_samples))) @@ -564,7 +659,7 @@ def load_alce(dataset, path, demo_path, shots=0): demo_prompt = demos["demo_prompt"] doc_prompt = demos["doc_prompt"] # there are 5 docs for each demo, and we use all of them - + user_template = "{demo_text}\n\n\n{instruction}\n\nQuestion: {question}\n\n{context}" system_template = "Answer:" prompt_template = user_template + "\n\n" + system_template @@ -574,14 +669,21 @@ def load_alce(dataset, path, demo_path, shots=0): num_docs = int(dataset.split("_")[-1]) def preprocess_example(example): - context = "\n\n".join([doc_prompt.format(**d, ID=idx+1) for idx, d in enumerate(example["docs"][:num_docs])]) - demo_text = "\n\n\n".join([ - demo_prompt.format(**demo, instruction=instruction, context = "\n\n".join([doc_prompt.format(**d, ID=idx+1) for idx, d in enumerate(demo["docs"])])) - for demo in random.sample(demos["demos"], shots) - ]) + context = "\n\n".join([doc_prompt.format(**d, ID=idx + 1) for idx, d in enumerate(example["docs"][:num_docs])]) + demo_text = "\n\n\n".join( + [ + demo_prompt.format( + **demo, + instruction=instruction, + context="\n\n".join([doc_prompt.format(**d, ID=idx + 1) for idx, d in enumerate(demo["docs"])]), + ) + for demo in random.sample(demos["demos"], shots) + ] + ) return {"context": context, "demo_text": demo_text, "instruction": instruction} + data = data.map(preprocess_example) - + return { "data": data, "prompt_template": prompt_template, @@ -591,11 +693,20 @@ def preprocess_example(example): def load_infbench(dataset, shots=0, max_test_samples=None, seed=42): - from datasets import load_dataset, Value, Sequence, Features - ft = Features({"id": Value("int64"), "context": Value("string"), "input": Value("string"), "answer": Sequence(Value("string")), "options": Sequence(Value("string"))}) + from datasets import Features, Sequence, Value, load_dataset + + ft = Features( + { + "id": Value("int64"), + "context": Value("string"), + "input": Value("string"), + "answer": Sequence(Value("string")), + "options": Sequence(Value("string")), + } + ) data = load_dataset("xinrongzhang2022/infinitebench", features=ft) - - # https://github.com/OpenBMB/InfiniteBench/blob/main/src/prompt.py + + # https://github.com/OpenBMB/InfiniteBench/blob/main/src/prompt.py # slightly modified to be consistent with other datasets, shouldn't affect performance post_process = default_post_process if "qa_eng" in dataset: @@ -606,6 +717,7 @@ def 
load_infbench(dataset, shots=0, max_test_samples=None, seed=42): user_template = "You are given a story and a question with multiple choices. Choose the best answer from the options provided. Only one of the following options is correct, output the answer using one single letter (A, B, C, or D). Don't say anything else.\n\n{demo}{context}\n\nQuestion: {question}\nOptions:\n{options}" system_template = "Answer:" data = data["longbook_choice_eng"] + def pp(output, example): prediction = output["output"] answer = example["answer"] @@ -628,7 +740,7 @@ def pp(output, example): return mets, {"parsed_output": parsed_pred} post_process = pp - + elif "sum_eng" in dataset: user_template = "You are given a book and you are tasked to summarize it. Write a summary of about 1000 to 1200 words. Only write about the plot and characters of the story. Do not discuss the themes or background of the book. Do not provide any analysis or commentary.\n\n{demo}{context}\n\nNow summarize the book." system_template = "Summary:" @@ -644,7 +756,7 @@ def process_example(example): update["options"] = options update["answer"] = [answer, f"{answer}. {example['answer'][0]}"] return update - + data = truncate_llama2(dataset, data) all_data = data.map(process_example) @@ -663,6 +775,7 @@ def add_demos(example): elif "sum_eng" in dataset: demo = "\n\n".join([f"[story text]\nSummary: {x['answer'][0].strip()}" for x in demos]) return {"demo": f"For example:\n\n{demo}\n\nNow, read the following story:\n\n"} + if shots > 0: data = data.map(add_demos) @@ -674,13 +787,14 @@ def add_demos(example): "post_process": post_process, } + def shuffle_labels(data, method="shuffle"): - """ - For classification tasks with fixed number of labels, we can shuffle the labels to make the task harder. + """For classification tasks with fixed number of labels, we can shuffle the labels to make the task harder. + The model needs to rely on the demo more than using the clue from the label names. We support different ways of doing this. 1. shuffle -- the label names don't change but we shuffle them (a bijection mapping from old to new and different label) - 2. numbers -- change labels to 0 to n-1 + 2. numbers -- change labels to 0 to n-1 3. uuid -- change labels to random uuids """ # 1. create the mapping from original label to the new label @@ -688,11 +802,12 @@ def shuffle_labels(data, method="shuffle"): if method == "shuffle": # random shuffle and then create a mapping, this gives us a random bijection mapping random.shuffle(label_set) - mapping = {label_set[i]: label_set[(i+1) % len(label_set)] for i in range(len(label_set))} + mapping = {label_set[i]: label_set[(i + 1) % len(label_set)] for i in range(len(label_set))} elif method == "numbers": mapping = {label: i for i, label in enumerate(label_set)} elif method == "uuid": import uuid + mapping = {label: str(uuid.uuid4()) for label in label_set} else: raise NotImplementedError(f"Unknown method {method}") @@ -701,14 +816,19 @@ def shuffle_labels(data, method="shuffle"): # 2. 
replace the original label with the new label in the text # we do the replace with system_template prepend to avoid replacing the label strings that are also substrings of the test text pattern = re.compile("|".join(mapping.keys())) + def replace(sample): - context_mapping = {data["system_template"].format(sample) + " " + k: data["system_template"].format(sample) + " " + v for k, v in mapping.items()} + context_mapping = { + data["system_template"].format(sample) + " " + k: data["system_template"].format(sample) + " " + v + for k, v in mapping.items() + } context_pattern = re.compile("|".join(context_mapping.keys())) return { "context": pattern.sub(lambda x: mapping[re.escape(x.group(0))], sample["context"]), "answer": mapping[sample["answer"]], "original_answer": sample["answer"], } + data["data"] = data["data"].map(replace) @@ -730,7 +850,14 @@ def default_post_process(output, example): def load_data(args, dataset, path=None, demo_path=None): if "popqa" in dataset: popularity_threshold = float(dataset.split("_")[-1]) - data = load_qa(dataset, path, demo_path, max_test_samples=args.max_test_samples, popularity_threshold=popularity_threshold, shots=args.shots) + data = load_qa( + dataset, + path, + demo_path, + max_test_samples=args.max_test_samples, + popularity_threshold=popularity_threshold, + shots=args.shots, + ) elif any([x in dataset for x in ["nq", "hotpotqa", "triviaqa"]]): data = load_qa(dataset, path, demo_path, max_test_samples=args.max_test_samples, shots=args.shots) elif dataset == "json_kv": @@ -744,7 +871,9 @@ def load_data(args, dataset, path=None, demo_path=None): elif "alce" in dataset: data = load_alce(dataset, path, demo_path, args.shots) if args.max_test_samples is not None: - data["data"] = data["data"].shuffle(seed=args.seed).select(range(min(args.max_test_samples, len(data["data"])))) + data["data"] = ( + data["data"].shuffle(seed=args.seed).select(range(min(args.max_test_samples, len(data["data"])))) + ) elif "icl" in dataset: data = load_icl(dataset, max_test_sample=args.max_test_samples, seed=args.seed) elif "multi_lexsum" in dataset: @@ -757,10 +886,10 @@ def load_data(args, dataset, path=None, demo_path=None): data = load_infbench(dataset, args.shots, args.max_test_samples, seed=args.seed) else: raise ValueError(f"Unknown dataset {dataset}") - + if "post_process" not in data: data["post_process"] = default_post_process - + return data diff --git a/evals/evaluation/HELMET/eval.py b/evals/evaluation/HELMET/eval.py index 557411e8..e33a6304 100644 --- a/evals/evaluation/HELMET/eval.py +++ b/evals/evaluation/HELMET/eval.py @@ -1,26 +1,22 @@ -import os +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 -from collections import defaultdict -import random import json +import logging +import os +import random import time +from collections import defaultdict -from tqdm import tqdm import numpy as np import torch -from torch.utils.data import DataLoader - from arguments import parse_arguments +from data import TestItemDataset, load_data from model_utils import load_LLM +from torch.utils.data import DataLoader +from tqdm import tqdm -from data import ( - load_data, - TestItemDataset, -) - -import logging -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S') +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -33,7 +29,10 @@ def run_test(args, model, 
dataset, test_file, demo_file): tag += f"_pop{args.popularity_threshold}" test_name = os.path.splitext(os.path.basename(test_file))[0] - output_path = os.path.join(args.output_dir, f"{dataset}_{tag}_{test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json") + output_path = os.path.join( + args.output_dir, + f"{dataset}_{tag}_{test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json", + ) if os.path.exists(output_path) and not args.overwrite and not args.debug: logger.info(f"{output_path} already exists, skipping...") return output_path @@ -43,9 +42,9 @@ def run_test(args, model, dataset, test_file, demo_file): logger.info(f"loaded {len(data['data'])} samples from {dataset}") dataloader = DataLoader( - TestItemDataset(data, model, model.tokenizer), - batch_size=1, - shuffle=False, + TestItemDataset(data, model, model.tokenizer), + batch_size=1, + shuffle=False, collate_fn=lambda x: x, num_workers=args.num_workers if not args.debug else 0, ) @@ -56,24 +55,24 @@ def run_test(args, model, dataset, test_file, demo_file): with torch.inference_mode(): for idx, inputs in enumerate(tqdm(dataloader)): test_item = data["data"][idx] - inputs, input_text = inputs[0] # batch size is just 1 + inputs, input_text = inputs[0] # batch size is just 1 if args.count_tokens: metrics["input_len"].append(inputs.input_ids.shape[1]) continue - + output = model.generate(inputs=inputs) if output is None: logger.info(f"skipping example {idx+1} because the model returned None") continue - # If we do not use the chat template, then we are doing completion, and for the sake of parsing, we want to prepend the system prompt to the input. + # If we do not use the chat template, then we are doing completion, and for the sake of parsing, we want to prepend the system prompt to the input. # For example, since we are autocompleting "Answer:"" in the input, then we should prepend the system prompt to the output as well. 
# This requires some coordination from the dataset preprocessing if not args.use_chat_template: prepend_text = data["system_template"].format(**test_item) output["output"] = prepend_text + output["output"] - - mets, others = data['post_process'](output, test_item) + + mets, others = data["post_process"](output, test_item) output.update({**others, **mets}) for k, v in mets.items(): metrics[k].append(v) @@ -84,7 +83,7 @@ def run_test(args, model, dataset, test_file, demo_file): result.pop("context", None) result.pop("input_ids", None) if input_text is None: - input_text = result['input_text'] + input_text = result["input_text"] results.append(result) # print out some examples, we also limit how much we print out since it can get really long @@ -98,9 +97,11 @@ def run_test(args, model, dataset, test_file, demo_file): logger.info(f"Answer: {test_item['answer'] if 'answer' in test_item else ''}") logger.info(f"Output: {output['output']}") logger.info(f"Parsed output: {output['parsed_output']}") - + if args.debug: - import pdb; pdb.set_trace() + import pdb + + pdb.set_trace() output = None @@ -110,14 +111,16 @@ def run_test(args, model, dataset, test_file, demo_file): logger.info(f"Throughput: {len(results) / (end_time - start_time):.02f} samples/s") if args.count_tokens: - logger.info(f"----{dataset}----\nAverage input length: {np.mean(metrics['input_len']):.02f}, std input length: {np.std(metrics['input_len']):.02f}, max input length: {max(metrics['input_len'])}, min input length: {min(metrics['input_len'])}\n----returning----") + logger.info( + f"----{dataset}----\nAverage input length: {np.mean(metrics['input_len']):.02f}, std input length: {np.std(metrics['input_len']):.02f}, max input length: {max(metrics['input_len'])}, min input length: {min(metrics['input_len'])}\n----returning----" + ) return output_path if len(results) == 0: logger.error("No results to evaluate, something went wrong, returning...") return output_path - averaged_metrics = {k: np.mean(v)*(100 if "_len" not in k else 1) for k, v in metrics.items()} + averaged_metrics = {k: np.mean(v) * (100 if "_len" not in k else 1) for k, v in metrics.items()} logger.info("Averaged metrics:") for k, v in averaged_metrics.items(): @@ -136,7 +139,7 @@ def run_test(args, model, dataset, test_file, demo_file): with open(output_path, "w") as f: json.dump(output, f, indent=4) # this makes it easier to parse results, but alce uses a different evaluation script - if not "alce" in dataset: + if "alce" not in dataset: with open(output_path + ".score", "w") as f: json.dump(output["averaged_metrics"], f, indent=4) logger.info(f"done, results are written to {output_path}") @@ -160,11 +163,21 @@ def main(): datasets = args.datasets.split(",") test_files = args.test_files.split(",") demo_files = args.demo_files.split(",") - max_lengths = ([int(args.input_max_length)] * len(datasets)) if isinstance(args.input_max_length, int) or len(args.input_max_length.split(",")) == 1 else [int(l) for l in args.input_max_length.split(",")] - gen_lengths = ([int(args.generation_max_length)] * len(datasets)) if isinstance(args.generation_max_length, int) or len(args.generation_max_length.split(",")) == 1 else [int(l) for l in args.generation_max_length.split(",")] + max_lengths = ( + ([int(args.input_max_length)] * len(datasets)) + if isinstance(args.input_max_length, int) or len(args.input_max_length.split(",")) == 1 + else [int(l) for l in args.input_max_length.split(",")] + ) + gen_lengths = ( + ([int(args.generation_max_length)] * len(datasets)) + if 
isinstance(args.generation_max_length, int) or len(args.generation_max_length.split(",")) == 1 + else [int(l) for l in args.generation_max_length.split(",")] + ) assert len(test_files) == len(demo_files) - for dataset, test_file, demo_file, max_length, gen_length in zip(datasets, test_files, demo_files, max_lengths, gen_lengths): + for dataset, test_file, demo_file, max_length, gen_length in zip( + datasets, test_files, demo_files, max_lengths, gen_lengths + ): args.datasets = dataset args.test_files = test_file args.demo_files = demo_file @@ -173,14 +186,19 @@ def main(): model.max_length = max_length model.generation_max_length = gen_length - try: + try: output_path = run_test(args, model, dataset, test_file, demo_file) - if "alce" in dataset and not args.count_tokens and (not os.path.exists(output_path+".score") or args.overwrite): + if ( + "alce" in dataset + and not args.count_tokens + and (not os.path.exists(output_path + ".score") or args.overwrite) + ): import eval_alce + logger.info("running eval_alce.py...") cli_args = ["--f", output_path] - if not "nocite" in dataset: + if "nocite" not in dataset: cli_args.append("--citations") if "asqa" in dataset: cli_args.append("--mauve") @@ -189,12 +207,12 @@ def main(): eval_alce.main(cli_args) except Exception as e: - # in case we run into some kind of error + # in case we run into some kind of error logger.exception(e) logger.error(f"Error in {dataset}, continuing...") if args.debug: raise e + if __name__ == "__main__": main() - diff --git a/evals/evaluation/HELMET/eval_alce.py b/evals/evaluation/HELMET/eval_alce.py index de9868e0..3b6e2b82 100644 --- a/evals/evaluation/HELMET/eval_alce.py +++ b/evals/evaluation/HELMET/eval_alce.py @@ -1,33 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse import collections +import copy import json +import logging import re import string -import torch -import copy +import sys +from collections import defaultdict -from nltk import sent_tokenize import numpy as np +import torch +from nltk import sent_tokenize from rouge_score import rouge_scorer, scoring from tqdm import tqdm -import sys -import logging -from collections import defaultdict -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S') + +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -from transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, - pipeline -) - -from utils import normalize_answer, get_max_memory, remove_citations +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline +from utils import get_max_memory, normalize_answer, remove_citations -QA_MODEL="gaotianyu1350/roberta-large-squad" -AUTOAIS_MODEL="google/t5_xxl_true_nli_mixture" +QA_MODEL = "gaotianyu1350/roberta-large-squad" +AUTOAIS_MODEL = "google/t5_xxl_true_nli_mixture" global autoais_model, autoais_tokenizer autoais_model, autoais_tokenizer = None, None @@ -69,6 +67,7 @@ def compute_exact(a_gold, a_pred): def exact_presence(short_answers, context): """Verify if any of the answers is present in the given context. + Args: short_answers: list of short answers to look for in the context context: a paragraph to search for short answers @@ -88,6 +87,7 @@ def exact_presence(short_answers, context): def compute_rouge(data): """Main function for rouge scoring. 
+ If two references are provided, the best score is chosen for each instance. Args: @@ -96,10 +96,8 @@ def compute_rouge(data): Returns: dictionary representation of rouge scores """ - def _rouge_calculation(hypotheses, - references1, - references2=[], - metrics=['rougeLsum']): + + def _rouge_calculation(hypotheses, references1, references2=[], metrics=["rougeLsum"]): if references2 == []: references2 = references1 @@ -110,7 +108,7 @@ def _rouge_calculation(hypotheses, for i in range(len(hypotheses)): scores1 = scorer.score(references1[i], hypotheses[i]) scores2 = scorer.score(references2[i], hypotheses[i]) - if scores1['rougeLsum'].fmeasure > scores2['rougeLsum'].fmeasure: + if scores1["rougeLsum"].fmeasure > scores2["rougeLsum"].fmeasure: aggregator.add_scores(scores1) else: aggregator.add_scores(scores2) @@ -132,7 +130,7 @@ def _rouge_calculation(hypotheses, for idx, item in enumerate(data): hypotheses[idx] = item["output"] - if "annotations" in item and item['annotations'] is not None: # For ASQA + if "annotations" in item and item["annotations"] is not None: # For ASQA references1[idx] = item["annotations"][0]["long_answer"] references2[idx] = item["annotations"][1]["long_answer"] else: @@ -148,12 +146,12 @@ def _rouge_calculation(hypotheses, if references2 is not None: r2.append(references2[key]) - h = ['\n'.join(sent_tokenize(text.lower())) for text in h] - r1 = ['\n'.join(sent_tokenize(text.lower())) for text in r1] - r2 = ['\n'.join(sent_tokenize(text.lower())) for text in r2] + h = ["\n".join(sent_tokenize(text.lower())) for text in h] + r1 = ["\n".join(sent_tokenize(text.lower())) for text in r1] + r2 = ["\n".join(sent_tokenize(text.lower())) for text in r2] scores = _rouge_calculation(h, r1, r2) - return scores['rougeLsum'] + return scores["rougeLsum"] def compute_str_em(data): @@ -164,7 +162,7 @@ def compute_str_em(data): STR-EM and STR-EM-HIT () """ - if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: + if "qa_pairs" not in data[0] or data[0]["qa_pairs"] is None: return 0, 0 acc = [] @@ -172,10 +170,10 @@ def compute_str_em(data): for item in data: loc_acc = [] - for qa_pair in item['qa_pairs']: - loc_acc.append(exact_presence(qa_pair['short_answers'], item["output"])) + for qa_pair in item["qa_pairs"]: + loc_acc.append(exact_presence(qa_pair["short_answers"], item["output"])) acc.append(np.mean(loc_acc)) - hit.append( int(np.mean(loc_acc) == 1) ) + hit.append(int(np.mean(loc_acc) == 1)) return 100 * np.mean(acc), 100 * np.mean(hit) @@ -192,18 +190,19 @@ def compute_len(data): def compute_qa(data): """Compute QA-based accuracy. 
+ Args: data: requires filed `qa_pairs/short_answers` and `output` Returns: QA metrics (QA-EM, QA-F1, QA-Hit) """ - if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: + if "qa_pairs" not in data[0] or data[0]["qa_pairs"] is None: logger.warn("Warning: no QA pairs found in data") return { - 'QA-EM': 0, - 'QA-F1': 0, - 'QA-Hit': 0, + "QA-EM": 0, + "QA-F1": 0, + "QA-Hit": 0, } # Load model @@ -215,8 +214,8 @@ def compute_qa(data): logger.info("Computing the QA-based accuracy...") em, f1, bins = [], [], [] for item in tqdm(data): - question = [qa_pair['question'] for qa_pair in item['qa_pairs']] - context = item['output'] if len(item['output']) > 0 else " " + question = [qa_pair["question"] for qa_pair in item["qa_pairs"]] + context = item["output"] if len(item["output"]) > 0 else " " results = qa_pipeline(question=question, context=context, handle_impossible_answer=True) loc_counter, loc_em, loc_f1 = 0, 0, 0 @@ -232,11 +231,7 @@ def compute_qa(data): f1.append(loc_f1 / loc_counter) bins.append(loc_em == loc_counter) - return { - 'QA-EM': 100 * np.mean(em), - 'QA-F1': 100 * np.mean(f1), - 'QA-Hit': 100 * np.mean(bins) - } + return {"QA-EM": 100 * np.mean(em), "QA-F1": 100 * np.mean(f1), "QA-Hit": 100 * np.mean(bins)} def compute_mauve(data): @@ -249,10 +244,15 @@ def compute_mauve(data): # Remove ending punctuations # Remove any new lines # Truncate by 100 words - human_data.append(' '.join((item['question'] + " " + item['answer'].strip()).split()[:100]).rstrip(string.punctuation)) - model_data.append(' '.join((item['question'] + " " + item['output'].strip()).split()[:100]).rstrip(string.punctuation)) + human_data.append( + " ".join((item["question"] + " " + item["answer"].strip()).split()[:100]).rstrip(string.punctuation) + ) + model_data.append( + " ".join((item["question"] + " " + item["output"].strip()).split()[:100]).rstrip(string.punctuation) + ) import mauve + out = mauve.compute_mauve( p_text=human_data, q_text=model_data, @@ -260,14 +260,14 @@ def compute_mauve(data): max_text_length=512, verbose=True, batch_size=8, - featurize_model_name="gpt2-large" + featurize_model_name="gpt2-large", ) return out.mauve * 100 def _run_nli_autoais(passage, claim): - """ - Run inference for assessing AIS between a premise and hypothesis. + """Run inference for assessing AIS between a premise and hypothesis. + Adapted from https://github.com/google-research-datasets/Attributed-QA/blob/main/evaluation.py """ global autoais_model, autoais_tokenizer @@ -284,13 +284,15 @@ def compute_claims(data): global autoais_model, autoais_tokenizer if autoais_model is None: logger.info("Loading AutoAIS model...") - autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") + autoais_model = AutoModelForSeq2SeqLM.from_pretrained( + AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto" + ) autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) logger.info("Computing claims...") scores = [] for item in tqdm(data): - normalized_output = remove_citations(item['output']) + normalized_output = remove_citations(item["output"]) entail = 0 claims = item["claims"] for claim in claims: @@ -299,13 +301,14 @@ def compute_claims(data): return 100 * np.mean(scores) -def compute_autoais(data, - decontext=False, - concat=False, - qampari=False, - at_most_citations=None,): - """ - Compute AutoAIS score. 
+def compute_autoais( + data, + decontext=False, + concat=False, + qampari=False, + at_most_citations=None, +): + """Compute AutoAIS score. Args: data: requires field `output` and `docs` @@ -317,7 +320,9 @@ def compute_autoais(data, global autoais_model, autoais_tokenizer if autoais_model is None: logger.info("Loading AutoAIS model...") - autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") + autoais_model = AutoModelForSeq2SeqLM.from_pretrained( + AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto" + ) autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) logger.info(f"Running AutoAIS...") @@ -327,9 +332,9 @@ def _format_document(doc): if "sent" in doc: # QA-extracted docs - return "Title: %s\n%s" % (doc['title'], doc['sent']) + return "Title: %s\n%s" % (doc["title"], doc["sent"]) else: - return "Title: %s\n%s" % (doc['title'], doc['text']) + return "Title: %s\n%s" % (doc["title"], doc["text"]) ais_scores = [] ais_scores_prec = [] @@ -343,9 +348,11 @@ def _format_document(doc): for item in tqdm(data): # Get sentences by using NLTK if qampari: - sents = [item['question'] + " " + x.strip() for x in item['output'].rstrip().rstrip(".").rstrip(",").split(",")] + sents = [ + item["question"] + " " + x.strip() for x in item["output"].rstrip().rstrip(".").rstrip(",").split(",") + ] else: - sents = sent_tokenize(item['output']) + sents = sent_tokenize(item["output"]) # we also ignore sentences that are < 5 characters long, they are unlikely to be meaningful # this resolves the case where the sentencizer takes "1." as a sentence sents = [x for x in sents if len(x.strip()) >= 5] @@ -358,37 +365,39 @@ def _format_document(doc): entail_prec = 0 total_citations = 0 for sent_id, sent in enumerate(sents): - target_sent = target_sents[sent_id] # Citation removed and (if opted for) decontextualized - joint_entail = -1 # Undecided + target_sent = target_sents[sent_id] # Citation removed and (if opted for) decontextualized + joint_entail = -1 # Undecided # Find references - ref = [int(r[1:])-1 for r in re.findall(r"\[\d+", sent)] # In text citation id starts from 1 + ref = [int(r[1:]) - 1 for r in re.findall(r"\[\d+", sent)] # In text citation id starts from 1 for r in ref: citation_position_count[r] += 1 logger.info(f"For `{sent}`, find citations {ref}") if len(ref) == 0: # No citations joint_entail = 0 - elif any([ref_id >= len(item['docs']) for ref_id in ref]): + elif any([ref_id >= len(item["docs"]) for ref_id in ref]): # Citations out of range joint_entail = 0 else: if at_most_citations is not None: ref = ref[:at_most_citations] total_citations += len(ref) - joint_passage = '\n'.join([_format_document(item['docs'][psgs_id]) for psgs_id in ref]) + joint_passage = "\n".join([_format_document(item["docs"][psgs_id]) for psgs_id in ref]) # If not directly rejected by citation format error, calculate the recall score if joint_entail == -1: joint_entail = _run_nli_autoais(joint_passage, target_sent) - autoais_log.append({ - "question": item['question'], - "output": item['output'], - "claim": sent, - "passage": [joint_passage], - "model_type": "NLI", - "model_output": joint_entail, - }) + autoais_log.append( + { + "question": item["question"], + "output": item["output"], + "claim": sent, + "passage": [joint_passage], + "model_type": "NLI", + "model_output": joint_entail, + } + ) entail += joint_entail if len(ref) > 1: @@ -400,16 +409,16 @@ def 
_format_document(doc): # Precision check: did the model cite any unnecessary documents? for psgs_id in ref: # condition A - passage = _format_document(item['docs'][psgs_id]) + passage = _format_document(item["docs"][psgs_id]) nli_result = _run_nli_autoais(passage, target_sent) # condition B if not nli_result: subset_exclude = copy.deepcopy(ref) subset_exclude.remove(psgs_id) - passage = '\n'.join([_format_document(item['docs'][pid]) for pid in subset_exclude]) + passage = "\n".join([_format_document(item["docs"][pid]) for pid in subset_exclude]) nli_result = _run_nli_autoais(passage, target_sent) - if nli_result: # psgs_id is not necessary + if nli_result: # psgs_id is not necessary flag = 0 sent_mcite_overcite += 1 else: @@ -421,14 +430,17 @@ def _format_document(doc): sent_total += len(sents) ais_scores.append(entail / len(sents)) - ais_scores_prec.append(entail_prec / total_citations if total_citations > 0 else 0) # len(sents)) + ais_scores_prec.append(entail_prec / total_citations if total_citations > 0 else 0) # len(sents)) if sent_mcite > 0 and sent_mcite_support > 0: - print("Among all sentences, %.2f%% have multiple citations, among which %.2f%% are supported by the joint set, among which %.2f%% overcite." % ( - 100 * sent_mcite / sent_total, - 100 * sent_mcite_support / sent_mcite, - 100 * sent_mcite_overcite / sent_mcite_support - )) + print( + "Among all sentences, %.2f%% have multiple citations, among which %.2f%% are supported by the joint set, among which %.2f%% overcite." + % ( + 100 * sent_mcite / sent_total, + 100 * sent_mcite_support / sent_mcite, + 100 * sent_mcite_overcite / sent_mcite_support, + ) + ) return { "citation_rec": 100 * np.mean(ais_scores) if len(ais_scores) > 0 else 0, @@ -447,16 +459,16 @@ def compute_qampari_f1(data, cot=False): num_preds = [] for item in data: if cot: - if ":" in item['output']: - o = ':'.join(item['output'].split(":")[1:]) # try to separate the COT part and the answer list part. + if ":" in item["output"]: + o = ":".join(item["output"].split(":")[1:]) # try to separate the COT part and the answer list part. else: o = "" else: - o = item['output'] + o = item["output"] preds = [normalize_answer(x.strip()) for x in o.rstrip().rstrip(".").rstrip(",").split(",")] - preds = [p for p in preds if len(p) > 0] # delete empty answers + preds = [p for p in preds if len(p) > 0] # delete empty answers num_preds.append(len(preds)) - answers = [[normalize_answer(x) for x in ans] for ans in item['answers']] + answers = [[normalize_answer(x) for x in ans] for ans in item["answers"]] flat_answers = [item for sublist in answers for item in sublist] prec.append(sum([p in flat_answers for p in preds]) / len(preds) if len(preds) > 0 else 0) @@ -480,19 +492,29 @@ def compute_qampari_f1(data, cot=False): "qampari_f1_top5": 100 * np.mean(f1_top5), } + def main(args=None): parser = argparse.ArgumentParser() - parser.add_argument("--f", type=str, required=True, help="Output file. Should have field `question`, `output`, (ROUGE) `answer`, \ - (accuracy) `qa_pairs`, (AIS) `docs`") + parser.add_argument( + "--f", + type=str, + required=True, + help="Output file. 
Should have field `question`, `output`, (ROUGE) `answer`, \ + (accuracy) `qa_pairs`, (AIS) `docs`", + ) parser.add_argument("--no_rouge", action="store_true", help="Do not evaluate ROUGE score") parser.add_argument("--qa", action="store_true", help="Use the QA model") parser.add_argument("--mauve", action="store_true", help="Use the mauve score model") parser.add_argument("--citations", action="store_true", help="Evaluation with citation") - parser.add_argument("--at_most_citations", type=int, default=3, help="At most take this many documents (mostly for precision)") + parser.add_argument( + "--at_most_citations", type=int, default=3, help="At most take this many documents (mostly for precision)" + ) parser.add_argument("--claims_nli", action="store_true", help="Use claims for ELI5") # QAMPARI - parser.add_argument("--cot", action="store_true", help="For QAMPARI, try to find colon and separate the COT and answer listing") + parser.add_argument( + "--cot", action="store_true", help="For QAMPARI, try to find colon and separate the COT and answer listing" + ) if args is None: args = parser.parse_args() @@ -501,7 +523,7 @@ def main(args=None): with open(args.f) as f: data_with_config = json.load(f) - data = data_with_config['data'] + data = data_with_config["data"] if "qampari" in args.f: args.no_rouge = True @@ -518,26 +540,25 @@ def main(args=None): logger.warning("We replace any on the fly search result to standard bracket citation format.") for i in range(len(data)): # data[i]['output'] = data[i]['output'].strip().split("\n")[0] - data[i]['output'] = re.sub(r"\n+", " ", data[i]['output']) - data[i]['output'] = data[i]['output'].replace("<|im_end|>", "") - + data[i]["output"] = re.sub(r"\n+", " ", data[i]["output"]) + data[i]["output"] = data[i]["output"].replace("<|im_end|>", "") # Remove all citations for all non-AutoAIS evaluation normalized_data = copy.deepcopy(data) for i in range(len(normalized_data)): - normalized_data[i]['output'] = remove_citations(normalized_data[i]['output']) + normalized_data[i]["output"] = remove_citations(normalized_data[i]["output"]) result = {} - result['length'] = compute_len(normalized_data) - result['str_em'], result['str_hit'] = compute_str_em(normalized_data) + result["length"] = compute_len(normalized_data) + result["str_em"], result["str_hit"] = compute_str_em(normalized_data) if qampari: result.update(compute_qampari_f1(normalized_data, cot=args.cot)) if not args.no_rouge: - result['rougeLsum'] = compute_rouge(normalized_data) + result["rougeLsum"] = compute_rouge(normalized_data) if args.qa: result.update(compute_qa(normalized_data)) if args.mauve: - result['mauve'] = compute_mauve(normalized_data) + result["mauve"] = compute_mauve(normalized_data) if args.citations: result.update(compute_autoais(data, qampari=qampari, at_most_citations=args.at_most_citations)) if args.claims_nli: diff --git a/evals/evaluation/HELMET/model_utils.py b/evals/evaluation/HELMET/model_utils.py index 78465c42..30ee2e27 100644 --- a/evals/evaluation/HELMET/model_utils.py +++ b/evals/evaluation/HELMET/model_utils.py @@ -1,12 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import functools +import logging import os import time import torch from transformers import PreTrainedTokenizer -import functools -import logging -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S') + +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y 
%H:%M:%S") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -21,6 +24,7 @@ def format_chat(message, include_system=False, system_message="You are a helpful chat = [{"role": "user", "content": message}] return chat + def call_api(func, limit=5, pause=10): count = 0 while True: @@ -29,7 +33,12 @@ def call_api(func, limit=5, pause=10): break except Exception as e: logger.info(f"Exception while using api: {e}") - if "rate limit" in str(e).lower() or "rate_limit" in str(e).lower() or "quota" in str(e).lower() or "429" in str(e): + if ( + "rate limit" in str(e).lower() + or "rate_limit" in str(e).lower() + or "quota" in str(e).lower() + or "429" in str(e) + ): logger.info(f"Rate limit exceeded, waiting {pause} secs and retrying...") time.sleep(pause) elif count < limit: @@ -41,6 +50,7 @@ def call_api(func, limit=5, pause=10): break return output + class LLM: def __init__( self, @@ -68,17 +78,17 @@ def __init__( def prepare_inputs(self, test_item, data): raise NotImplementedError("prepare_inputs not implemented for LLM") - + def generate(self, inputs=None, prompt=None, **kwargs): raise NotImplementedError("generate not implemented for LLM") class OpenAIModel(LLM): def __init__( - self, - model_name, - temperature=0.9, - top_p=0.9, + self, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, generation_min_length=0, @@ -86,11 +96,11 @@ def __init__( stop_newline=False, use_chat_template=True, **kwargs, - ): + ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -100,21 +110,24 @@ def __init__( ) import openai import tiktoken + if "azure" in model_name: - # env var: AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and OPENAI_API_VERSION + # env var: AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and OPENAI_API_VERSION self.model = openai.AzureOpenAI() - model_name = model_name[model_name.index("/")+1:] + model_name = model_name[model_name.index("/") + 1 :] else: # make sure to set the OPENAI_API_KEY environment variable self.model = openai.OpenAI() self.model_name = model_name self.tokenizer = tiktoken.encoding_for_model(model_name) - def prepare_inputs(self, test_item, data): buffer = 100 # we don't include system message to stay consistent with other models - prompt = format_chat(data["user_template"].format(**test_item), include_system=False,) + prompt = format_chat( + data["user_template"].format(**test_item), + include_system=False, + ) inputs = "\n".join([f"Role: {x['role']}\nContent: {x['content']}" for x in prompt]) tokens = self.tokenizer.encode(inputs) input_len = len(tokens) @@ -129,7 +142,7 @@ def prepare_inputs(self, test_item, data): new_context = self.tokenizer.decode(self.tokenizer.encode(test_item["context"])[:-truncate_length]) test_item["context"] = new_context prompt = format_chat(data["user_template"].format(**test_item), include_system=False) - return prompt + return prompt """ inputs: list[str] @@ -137,15 +150,16 @@ def prepare_inputs(self, test_item, data): prompt: str the user message to be sent to the model """ + def generate(self, inputs=None, prompt=None, system_message="You are a helpful assistant", **kwargs): if inputs is None: inputs = format_chat(prompt, include_system=True, system_message=system_message) - + # kwargs can be used to pass additional parameters to the model: max_tokens, stop, etc. 
func = functools.partial( - self.model.chat.completions.create, - model=self.model_name, - messages=inputs, + self.model.chat.completions.create, + model=self.model_name, + messages=inputs, max_tokens=self.generation_max_length, temperature=self.temperature if self.do_sample else 0.0, top_p=self.top_p, @@ -155,7 +169,7 @@ def generate(self, inputs=None, prompt=None, system_message="You are a helpful a output = call_api(func) if output is not None: if output.choices[0].message.content is None: - # sometimes the model output can get filtered but sitll return a message + # sometimes the model output can get filtered but still return a message return None return { "output": output.choices[0].message.content, @@ -165,12 +179,13 @@ def generate(self, inputs=None, prompt=None, system_message="You are a helpful a } return None + class AnthropicModel(LLM): def __init__( - self, - model_name, - temperature=0.9, - top_p=0.9, + self, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, generation_min_length=0, @@ -178,11 +193,11 @@ def __init__( stop_newline=False, use_chat_template=True, **kwargs, - ): + ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -191,10 +206,11 @@ def __init__( use_chat_template=use_chat_template, ) from anthropic import Anthropic, AnthropicVertex + if "vertex" in model_name: # region defaults to env var CLOUD_ML_REGION and project_id defaults to ANTHROPIC_VERTEX_PROJECT_ID self.model = AnthropicVertex() - model_name = model_name[model_name.index("/")+1:] + model_name = model_name[model_name.index("/") + 1 :] else: # remember to set ANTHROPIC_API_KEY environment variable (the default) self.model = Anthropic() @@ -207,14 +223,13 @@ def __init__( self.generation_max_length = generation_max_length self.do_sample = do_sample self.stops = None - if stop_newline: # claude does not support newline + if stop_newline: # claude does not support newline pass - def prepare_inputs(self, test_item, data): buffer = 100 prompt = format_chat( - data["user_template"].format(**test_item), + data["user_template"].format(**test_item), include_system=False, ) inputs = "\n".join([f"Role: {x['role']}\nContent: {x['content']}" for x in prompt]) @@ -224,14 +239,13 @@ def prepare_inputs(self, test_item, data): if input_len > self.max_length - self.generation_max_length - buffer: truncate_length = input_len - (self.max_length - self.generation_max_length - buffer) tokens = self.tokenizer.encode(test_item["context"]) - new_context = test_item["context"][:tokens.offsets[-truncate_length-1][1]] + new_context = test_item["context"][: tokens.offsets[-truncate_length - 1][1]] test_item["context"] = new_context prompt = format_chat( - data["user_template"].format(**test_item), + data["user_template"].format(**test_item), include_system=False, ) return prompt - """ inputs: list[str] @@ -239,19 +253,20 @@ def prepare_inputs(self, test_item, data): prompt: str the user message to be sent to the model """ + def generate(self, inputs=None, prompt=None, **kwargs): if inputs is None: inputs = format_chat(prompt, include_system=False) - + # kwargs can be used to pass additional parameters to the model: max_tokens, stop, etc. # Note: in the original paper, we used this system message: # system="You are a helpful assistant. Make sure your output does not contain new lines." 
- # To be consistent with the other models, and for future compability, we remove the system message + # To be consistent with the other models, and for future compatibility, we remove the system message # We don't expect this to make a significant difference in the results func = functools.partial( self.model.messages.create, - model=self.model_name, - messages=inputs, + model=self.model_name, + messages=inputs, max_tokens=self.generation_max_length, temperature=self.temperature if self.do_sample else 0.0, top_p=self.top_p, @@ -272,10 +287,10 @@ def generate(self, inputs=None, prompt=None, **kwargs): class GeminiModel(LLM): def __init__( - self, - model_name, - temperature=0.9, - top_p=0.9, + self, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, generation_min_length=0, @@ -283,11 +298,11 @@ def __init__( stop_newline=False, use_chat_template=True, **kwargs, - ): + ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -297,12 +312,15 @@ def __init__( ) import google.generativeai as genai + # default env var GOOGLE_API_KEY genai.configure(api_key=os.environ.get("GOOGLE_API_KEY")) import vertexai - vertexai.init() # make sure to set the env var appropriately + + vertexai.init() # make sure to set the env var appropriately from vertexai.preview.tokenization import get_tokenizer_for_model + self.model = genai.GenerativeModel(model_name) self.tokenizer = get_tokenizer_for_model(model_name) self.model_name = model_name @@ -318,30 +336,31 @@ def prepare_inputs(self, test_item, data): truncate_length = input_len - (max_length - self.generation_max_length - buffer) # not the most pretty way of doing this but it works... 
# the documentation doesn't provide an official way to truncate - new_context = self.tokenizer._sentencepiece_adapter._tokenizer.decode(self.tokenizer.compute_tokens(test_item["context"]).token_info_list[0].token_ids[:-truncate_length]) - test_item['context'] = new_context + new_context = self.tokenizer._sentencepiece_adapter._tokenizer.decode( + self.tokenizer.compute_tokens(test_item["context"]).token_info_list[0].token_ids[:-truncate_length] + ) + test_item["context"] = new_context prompt = data["prompt_template"].format(**test_item) - + return prompt def generate(self, inputs=None, prompt=None, **kwargs): import google.generativeai as genai + if inputs is None: inputs = prompt - - generation_config = genai.GenerationConfig(temperature=self.temperature, top_p=self.top_p, max_output_tokens=self.generation_max_length) - func = functools.partial( - self.model.generate_content, - contents=inputs, - generation_config=generation_config + + generation_config = genai.GenerationConfig( + temperature=self.temperature, top_p=self.top_p, max_output_tokens=self.generation_max_length ) + func = functools.partial(self.model.generate_content, contents=inputs, generation_config=generation_config) output = call_api(func, pause=15) if output is not None: try: # can probably check the output for errors but it's not well documented output.text except Exception as e: - logger.error(f"Error in output: {output}; {e}") + logger.error(f"Error in output: {output}; {e}") return None return { @@ -356,9 +375,9 @@ def generate(self, inputs=None, prompt=None, **kwargs): class TogetherModel(LLM): def __init__( self, - model_name, - temperature=0.9, - top_p=0.9, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, generation_min_length=0, @@ -368,9 +387,9 @@ def __init__( **kwargs, ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -379,19 +398,20 @@ def __init__( use_chat_template=use_chat_template, ) - from transformers import AutoTokenizer from together import Together + from transformers import AutoTokenizer + # default env var TOGETHER_API_KEY self.model = Together() # should change this to be more flexible in the future lol self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-405B-Instruct") self.model_name = model_name.replace("togetherapi/", "") - + def prepare_inputs(self, test_item, data): buffer = 100 prompt = format_chat( - data["user_template"].format(**test_item), - system_message=data.get("system_message", "You are a helpful assistant.") + data["user_template"].format(**test_item), + system_message=data.get("system_message", "You are a helpful assistant."), ) tokens = self.tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True) input_len = len(tokens) @@ -400,14 +420,14 @@ def prepare_inputs(self, test_item, data): if input_len > max_length - self.generation_max_length - buffer: truncate_length = input_len - (max_length - self.generation_max_length - buffer) context_tokens = self.tokenizer(test_item["context"], return_offsets_mapping=True) - new_context = test_item["context"][:context_tokens["offset_mapping"][-truncate_length][0]] - + new_context = test_item["context"][: context_tokens["offset_mapping"][-truncate_length][0]] + test_item["context"] = new_context prompt = format_chat( - data["user_template"].format(**test_item), 
- system_message=data.get("system_message", "You are a helpful assistant.") + data["user_template"].format(**test_item), + system_message=data.get("system_message", "You are a helpful assistant."), ) - return prompt + return prompt """ inputs: list[str] @@ -415,15 +435,16 @@ def prepare_inputs(self, test_item, data): prompt: str the user message to be sent to the model """ + def generate(self, inputs=None, prompt=None, system_message="You are a helpful assistant", **kwargs): if inputs is None: inputs = format_chat(prompt, include_system=True, system_message=system_message) - + # kwargs can be used to pass additional parameters to the model: max_tokens, stop, etc. func = functools.partial( - self.model.chat.completions.create, - model=self.model_name, - messages=inputs, + self.model.chat.completions.create, + model=self.model_name, + messages=inputs, max_tokens=self.generation_max_length, temperature=self.temperature if self.do_sample else 0.0, top_p=self.top_p, @@ -433,7 +454,7 @@ def generate(self, inputs=None, prompt=None, system_message="You are a helpful a output = call_api(func) if output is not None: if output.choices[0].message.content is None: - # sometimes the model output can get filtered but sitll return a message + # sometimes the model output can get filtered but still return a message return None return { "output": output.choices[0].message.content, @@ -448,15 +469,15 @@ def tokenize(sample, data, tokenizer, max_length, generation_max_length, use_cha def format_input(sample): if use_chat_template: chat = format_chat( - data["user_template"].format(**sample), + data["user_template"].format(**sample), include_system=False, - system_message=data.get("system_message", "You are a helpful assistant.") + system_message=data.get("system_message", "You are a helpful assistant."), ) try: prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) except Exception as e: chat = format_chat( - data["user_template"].format(**sample), + data["user_template"].format(**sample), include_system=False, ) prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) @@ -466,12 +487,12 @@ def format_input(sample): prompt = data["prompt_template"].format(**sample) tokenized_input = tokenizer([prompt], return_tensors="pt") return tokenized_input - + if "Phi3SmallTokenizer" in str(type(tokenizer)): - buffer = 64 if max_length == 131072 else 0 # there is some problem with their rotary emb implementation + buffer = 64 if max_length == 131072 else 0 # there is some problem with their rotary emb implementation else: buffer = 0 - + tokenized_input = format_input(sample) if tokenized_input.input_ids.size(1) > max_length - generation_max_length - buffer: truncate_length = tokenized_input.input_ids.size(1) - (max_length - generation_max_length - buffer) @@ -482,7 +503,7 @@ def format_input(sample): new_context = tokenizer.decode(context_tokens["input_ids"][:-truncate_length]) else: context_tokens = tokenizer([sample["context"]], return_offsets_mapping=True) - new_context = sample["context"][:context_tokens["offset_mapping"][0][-truncate_length][0]] + new_context = sample["context"][: context_tokens["offset_mapping"][0][-truncate_length][0]] sample["context"] = new_context tokenized_input = format_input(sample) @@ -491,10 +512,10 @@ def format_input(sample): class HFModel(LLM): def __init__( - self, - model_name, - temperature=0.9, - top_p=0.9, + self, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, 
generation_min_length=0, @@ -504,9 +525,9 @@ def __init__( **kwargs, ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -516,9 +537,11 @@ def __init__( ) import transformers - from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoConfig + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig + model_kwargs = {} from pkg_resources import parse_version + if parse_version(transformers.__version__) <= parse_version("4.34.1"): model_kwargs["use_flash_attention_2"] = True else: @@ -539,14 +562,14 @@ def __init__( if "rope_theta" in kwargs and kwargs["rope_theta"] is not None: logger.info(f"Override rope theta to {kwargs['rope_theta']}") config.rope_theta = kwargs["rope_theta"] - + self.model = AutoModelForCausalLM.from_pretrained( - model_name, + model_name, config=config, torch_dtype=kwargs.get("torch_dtype", torch.bfloat16), device_map="auto", trust_remote_code=True, - **model_kwargs + **model_kwargs, ) if kwargs.get("torch_compile", True): self.model = torch.compile(self.model) @@ -556,7 +579,9 @@ def __init__( stop_token_ids = [stop_token_ids] if not isinstance(stop_token_ids, list) else stop_token_ids if stop_newline: stop = list(set(["\n", "Ċ", "ĊĊ", "<0x0A>"])) - stop_token_ids = list(set([self.tokenizer.convert_tokens_to_ids(stop_token) for stop_token in stop] + stop_token_ids)) + stop_token_ids = list( + set([self.tokenizer.convert_tokens_to_ids(stop_token) for stop_token in stop] + stop_token_ids) + ) if "llama" in model_name.lower(): stop_token_ids.remove(self.tokenizer.unk_token_id) stop_token_ids = [x for x in stop_token_ids if x is not None] @@ -566,25 +591,31 @@ def __init__( if "gemma" in model_name.lower(): self.disable_prefill = True - logger.warning("gemma models cannot prefill with past kvs due to cache implementation, need to change the code manually if you need to prefill") - - + logger.warning( + "gemma models cannot prefill with past kvs due to cache implementation, need to change the code manually if you need to prefill" + ) + def prepare_inputs(self, test_item, data): return tokenize( - test_item, - data, - tokenizer=self.tokenizer, + test_item, + data, + tokenizer=self.tokenizer, max_length=self.max_length, generation_max_length=self.generation_max_length, use_chat_template=self.use_chat_template, ) - - + @torch.no_grad() def generate(self, inputs=None, prompt=None, **kwargs): if inputs is None: - inputs = self.tokenizer([prompt], return_tensors="pt", max_length=self.max_length-self.generation_max_length, truncation=True, padding=True) - + inputs = self.tokenizer( + [prompt], + return_tensors="pt", + max_length=self.max_length - self.generation_max_length, + truncation=True, + padding=True, + ) + inputs = inputs.to(self.model.device) input_len = inputs.input_ids.size(1) if hasattr(self.model, "model") and not self.disable_prefill: @@ -592,12 +623,21 @@ def generate(self, inputs=None, prompt=None, **kwargs): extra = {} if "jamba" in str(type(self.model)).lower(): from transformers.models.jamba.modeling_jamba import HybridMambaAttentionDynamicCache - cache = HybridMambaAttentionDynamicCache(self.model.config, inputs.input_ids.shape[0], self.model.dtype, device=self.model.device) + + cache = HybridMambaAttentionDynamicCache( + self.model.config, inputs.input_ids.shape[0], self.model.dtype, 
device=self.model.device + ) extra = {"past_key_values": cache} - prefill = self.model.model(input_ids=inputs.input_ids[..., :-1], attention_mask=inputs.attention_mask[..., :-1], **extra) + prefill = self.model.model( + input_ids=inputs.input_ids[..., :-1], attention_mask=inputs.attention_mask[..., :-1], **extra + ) past_key_values = prefill.past_key_values - inputs = {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "past_key_values": past_key_values} + inputs = { + "input_ids": inputs.input_ids, + "attention_mask": inputs.attention_mask, + "past_key_values": past_key_values, + } if past_key_values is None: self.disable_prefill = True logger.warning("past key values is None, not able to prefill with KVs, disabling...") @@ -614,22 +654,26 @@ def generate(self, inputs=None, prompt=None, **kwargs): return_dict_in_generate=True, output_scores=False, ) - text = self.tokenizer.decode(outputs['sequences'][0, input_len:], skip_special_tokens=True) - save_prompt = self.tokenizer.decode(inputs["input_ids"][0][:500]) + " " + self.tokenizer.decode(inputs["input_ids"][0][-500:]) + text = self.tokenizer.decode(outputs["sequences"][0, input_len:], skip_special_tokens=True) + save_prompt = ( + self.tokenizer.decode(inputs["input_ids"][0][:500]) + + " " + + self.tokenizer.decode(inputs["input_ids"][0][-500:]) + ) return { "output": text, "input_len": input_len, - "output_len": outputs['sequences'].size(1) - input_len, + "output_len": outputs["sequences"].size(1) - input_len, "input_text": save_prompt, } class VLLMModel(LLM): def __init__( - self, - model_name, - temperature=0.9, - top_p=0.9, + self, + model_name, + temperature=0.9, + top_p=0.9, max_length=32768, generation_max_length=2048, generation_min_length=0, @@ -638,9 +682,9 @@ def __init__( use_chat_template=False, ): super().__init__( - model_name, - temperature=temperature, - top_p=top_p, + model_name, + temperature=temperature, + top_p=top_p, max_length=max_length, generation_max_length=generation_max_length, generation_min_length=generation_min_length, @@ -648,10 +692,11 @@ def __init__( stop_newline=stop_newline, use_chat_template=use_chat_template, ) - + from vllm import LLM + # at the time of testing: note that the max model length is derived from the config file, and if max_length is larger than that length, there will be an error. it appears that vllm does not support positional extrapolation - # there are some work arounds to this, but it may give unexpected results. + # there are some work arounds to this, but it may give unexpected results. 
self.model = LLM( model_name, tensor_parallel_size=torch.cuda.device_count(), @@ -661,35 +706,44 @@ def __init__( ) self.tokenizer = self.model.get_tokenizer() - def prepare_inputs(self, test_item, data): return tokenize( - test_item, - data, - tokenizer=self.tokenizer, + test_item, + data, + tokenizer=self.tokenizer, max_length=self.max_length, generation_max_length=self.generation_max_length, use_chat_template=self.use_chat_template, ) - def generate(self, inputs=None, prompt=None, **kwargs): from vllm import SamplingParams, TokensPrompt + if inputs is None: - inputs = self.tokenizer([prompt], return_tensors="pt", max_length=self.max_length-self.generation_max_length, truncation=True, padding=True) - + inputs = self.tokenizer( + [prompt], + return_tensors="pt", + max_length=self.max_length - self.generation_max_length, + truncation=True, + padding=True, + ) + self.sampling_params = SamplingParams( - temperature = self.temperature if self.do_sample else 0.0, - top_p = self.top_p, - max_tokens = self.generation_max_length, + temperature=self.temperature if self.do_sample else 0.0, + top_p=self.top_p, + max_tokens=self.generation_max_length, ) outputs = self.model.generate( prompts=TokensPrompt(prompt_token_ids=inputs["input_ids"][0].tolist()), sampling_params=self.sampling_params, - **kwargs + **kwargs, )[0] - save_prompt = self.tokenizer.decode(inputs["input_ids"][0][:500]) + " " + self.tokenizer.decode(inputs["input_ids"][0][-500:]) + save_prompt = ( + self.tokenizer.decode(inputs["input_ids"][0][:500]) + + " " + + self.tokenizer.decode(inputs["input_ids"][0][-500:]) + ) return { "output": outputs.outputs[0].text, "input_len": len(outputs.prompt_token_ids), @@ -719,18 +773,18 @@ def load_LLM(args): kwargs["torch_dtype"] = torch.float32 if args.rope_theta is not None: kwargs["rope_theta"] = args.rope_theta - + model = model_cls( - args.model_name_or_path, - temperature=args.temperature, - top_p=args.top_p, - max_length=args.input_max_length, - generation_max_length=args.generation_max_length, - generation_min_length=args.generation_min_length, - do_sample=args.do_sample, - stop_newline=args.stop_newline, + args.model_name_or_path, + temperature=args.temperature, + top_p=args.top_p, + max_length=args.input_max_length, + generation_max_length=args.generation_max_length, + generation_min_length=args.generation_min_length, + do_sample=args.do_sample, + stop_newline=args.stop_newline, use_chat_template=args.use_chat_template, **kwargs, ) - return model \ No newline at end of file + return model diff --git a/evals/evaluation/HELMET/prompts/asqa_nocite.json b/evals/evaluation/HELMET/prompts/asqa_nocite.json index e77d3094..b25485cb 100644 --- a/evals/evaluation/HELMET/prompts/asqa_nocite.json +++ b/evals/evaluation/HELMET/prompts/asqa_nocite.json @@ -109,4 +109,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/evals/evaluation/HELMET/prompts/asqa_revised.json b/evals/evaluation/HELMET/prompts/asqa_revised.json index fc95fde6..f342ef56 100644 --- a/evals/evaluation/HELMET/prompts/asqa_revised.json +++ b/evals/evaluation/HELMET/prompts/asqa_revised.json @@ -109,4 +109,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/evals/evaluation/HELMET/requirements.txt b/evals/evaluation/HELMET/requirements.txt index 4d2628c7..cb592781 100644 --- a/evals/evaluation/HELMET/requirements.txt +++ b/evals/evaluation/HELMET/requirements.txt @@ -1,11 +1,11 @@ -wheel -ninja -packaging -torch -datasets -transformers accelerate -sentencepiece +datasets flash-attn +ninja +packaging pytrec_eval 
rouge_score +sentencepiece +torch +transformers +wheel diff --git a/evals/evaluation/HELMET/scripts/collect_results.py b/evals/evaluation/HELMET/scripts/collect_results.py index 6737ce1a..df91ce83 100644 --- a/evals/evaluation/HELMET/scripts/collect_results.py +++ b/evals/evaluation/HELMET/scripts/collect_results.py @@ -1,9 +1,13 @@ -import os +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json +import os +from dataclasses import asdict, dataclass + import numpy as np import pandas as pd import yaml -from dataclasses import dataclass, asdict from tqdm import tqdm dataset_to_metrics = { @@ -12,19 +16,17 @@ "popqa": "substring_exact_match", "triviaqa": "substring_exact_match", "hotpotqa": "substring_exact_match", - - "narrativeqa": ["gpt-4-score",], + "narrativeqa": [ + "gpt-4-score", + ], "msmarco_rerank_psg": "NDCG@10", - "trec_coarse": "exact_match", "trec_fine": "exact_match", "banking77": "exact_match", "clinic150": "exact_match", "nlu": "exact_match", - "qmsum": "rougeL_recall", "multi_lexsum": ["gpt4-f1"], - "ruler_niah_s_1": "ruler_recall", "ruler_niah_s_2": "ruler_recall", "ruler_niah_s_3": "ruler_recall", @@ -38,28 +40,73 @@ "ruler_vt": "ruler_recall", "ruler_qa_1": "substring_exact_match", "ruler_qa_2": "substring_exact_match", - - "infbench_qa": [ "rougeL_f1"], + "infbench_qa": ["rougeL_f1"], "infbench_choice": ["exact_match"], "infbench_sum": ["gpt4-f1"], - "alce_asqa": ["str_em", "citation_rec", "citation_prec"], "alce_qampari": ["qampari_rec_top5", "citation_rec", "citation_prec"], } dataset_to_metrics = {k: [v] if isinstance(v, str) else v for k, v in dataset_to_metrics.items()} custom_avgs = { - "Recall": ["json_kv substring_exact_match", "ruler_niah_mk_2 ruler_recall", "ruler_niah_mk_3 ruler_recall", "ruler_niah_mv ruler_recall"], - "RAG": ['nq substring_exact_match', 'hotpotqa substring_exact_match', 'popqa substring_exact_match', 'triviaqa substring_exact_match',], - "ICL": ['trec_coarse exact_match', 'trec_fine exact_match', 'banking77 exact_match', 'clinic150 exact_match', 'nlu exact_match'], - "Cite": ['alce_asqa str_em', 'alce_asqa citation_rec', 'alce_asqa citation_prec', 'alce_qampari qampari_rec_top5', 'alce_qampari citation_rec', 'alce_qampari citation_prec', ], - "Re-rank": ['msmarco_rerank_psg NDCG@10', ], - "LongQA": ['narrativeqa gpt-4-score', 'infbench_qa rougeL_f1', 'infbench_choice exact_match', ], - "Summ": ['infbench_sum gpt4-f1', 'multi_lexsum gpt4-f1', ], - "RULER": ['ruler_niah_s_1 ruler_recall', 'ruler_niah_s_2 ruler_recall', 'ruler_niah_s_3 ruler_recall', 'ruler_niah_mk_1 ruler_recall', 'ruler_niah_mk_2 ruler_recall', 'ruler_niah_mk_3 ruler_recall', 'ruler_niah_mq ruler_recall', 'ruler_niah_mv ruler_recall', 'ruler_cwe ruler_recall', 'ruler_fwe ruler_recall', 'ruler_vt ruler_recall', 'ruler_qa_1 substring_exact_match', 'ruler_qa_2 substring_exact_match'], - "Ours-Real": ['RAG', 'ICL', 'Cite', 'Re-rank', 'LongQA', 'Summ'], - "Ours": ['Recall', 'RAG', 'ICL', 'Cite', 'Re-rank', 'LongQA', 'Summ'], + "Recall": [ + "json_kv substring_exact_match", + "ruler_niah_mk_2 ruler_recall", + "ruler_niah_mk_3 ruler_recall", + "ruler_niah_mv ruler_recall", + ], + "RAG": [ + "nq substring_exact_match", + "hotpotqa substring_exact_match", + "popqa substring_exact_match", + "triviaqa substring_exact_match", + ], + "ICL": [ + "trec_coarse exact_match", + "trec_fine exact_match", + "banking77 exact_match", + "clinic150 exact_match", + "nlu exact_match", + ], + "Cite": [ + "alce_asqa str_em", + "alce_asqa citation_rec", + "alce_asqa 
citation_prec", + "alce_qampari qampari_rec_top5", + "alce_qampari citation_rec", + "alce_qampari citation_prec", + ], + "Re-rank": [ + "msmarco_rerank_psg NDCG@10", + ], + "LongQA": [ + "narrativeqa gpt-4-score", + "infbench_qa rougeL_f1", + "infbench_choice exact_match", + ], + "Summ": [ + "infbench_sum gpt4-f1", + "multi_lexsum gpt4-f1", + ], + "RULER": [ + "ruler_niah_s_1 ruler_recall", + "ruler_niah_s_2 ruler_recall", + "ruler_niah_s_3 ruler_recall", + "ruler_niah_mk_1 ruler_recall", + "ruler_niah_mk_2 ruler_recall", + "ruler_niah_mk_3 ruler_recall", + "ruler_niah_mq ruler_recall", + "ruler_niah_mv ruler_recall", + "ruler_cwe ruler_recall", + "ruler_fwe ruler_recall", + "ruler_vt ruler_recall", + "ruler_qa_1 substring_exact_match", + "ruler_qa_2 substring_exact_match", + ], + "Ours-Real": ["RAG", "ICL", "Cite", "Re-rank", "LongQA", "Summ"], + "Ours": ["Recall", "RAG", "ICL", "Cite", "Re-rank", "LongQA", "Summ"], } + @dataclass class arguments: tag: str = "v1" @@ -79,25 +126,30 @@ class arguments: output_dir: str = "output" popularity_threshold: float = 3 flenqa_ctx_size: int = 1000 - + category: str = "synthetic" - + def update(self, new): for key, value in new.items(): if hasattr(self, key): setattr(self, key, value) - + def get_path(self): tag = self.tag if "flenqa" in self.dataset: tag += f"_ctx{self.flenqa_ctx_size}" - path = os.path.join(self.output_dir, "{args.dataset}_{tag}_{args.test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json".format(args=self, tag=tag)) + path = os.path.join( + self.output_dir, + "{args.dataset}_{tag}_{args.test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json".format( + args=self, tag=tag + ), + ) if os.path.exists(path.replace(".json", "-gpt4eval_o.json")): return path.replace(".json", "-gpt4eval_o.json") if "alce" in self.dataset: return path.replace(".json", ".json.score") - + if os.path.exists(path + ".score"): return path + ".score" return path @@ -107,7 +159,7 @@ def get_metric_name(self): if d in self.dataset: return d, m return None - + def get_averaged_metric(self): path = self.get_path() print(path) @@ -116,7 +168,7 @@ def get_averaged_metric(self): return None with open(path) as f: results = json.load(f) - + _, metric = self.get_metric_name() if path.endswith(".score"): if any([m not in results for m in metric]): @@ -127,22 +179,22 @@ def get_averaged_metric(self): if any([m not in results["averaged_metrics"] for m in metric]): print("metric doesn't exist") return None - s = {m: results['averaged_metrics'][m] for m in metric} - - s = {m : v * (100 if m == "gpt4-f1" else 1) * (100/3 if m == "gpt-4-score" else 1) for m, v in s.items()} + s = {m: results["averaged_metrics"][m] for m in metric} + + s = {m: v * (100 if m == "gpt4-f1" else 1) * (100 / 3 if m == "gpt-4-score" else 1) for m, v in s.items()} print("found scores:", s) return s - + def get_metric_by_depth(self): path = self.get_path() - path = path.replace(".score", '') + path = path.replace(".score", "") print(path) if not os.path.exists(path): return None with open(path) as f: results = json.load(f) - output = [] + output = [] _, metric = self.get_metric_name() metric = metric[0] keys = ["depth", "k", metric] 
@@ -150,19 +202,20 @@ def get_metric_by_depth(self): o = {} for key in keys: if key == "k" and "ctxs" in d: - d["k"] = len(d['ctxs']) + d["k"] = len(d["ctxs"]) if key not in d: print("no", key) return None o[key] = d[key] o["metric"] = o.pop(metric) output.append(o) - + df = pd.DataFrame(output) dfs = df.groupby(list(output[0].keys())[:-1]).mean().reset_index() return dfs.to_dict("records") + if __name__ == "__main__": # comment out the models you don't want to include models_configs = [ @@ -174,14 +227,12 @@ def get_metric_by_depth(self): {"model": "claude-3-5-sonnet-20240620", "use_chat_template": True, "training_length": 200000}, {"model": "gemini-1.5-flash-001", "use_chat_template": True, "training_length": 1048576}, {"model": "gemini-1.5-pro-001", "use_chat_template": True, "training_length": 2097152}, - # llama 2 based models {"model": "LLaMA-2-7B-32K", "use_chat_template": False, "training_length": 32768}, {"model": "Llama-2-7B-32K-Instruct", "training_length": 32768}, {"model": "llama-2-7b-80k-basefixed", "use_chat_template": False, "training_length": 80000}, {"model": "Yarn-Llama-2-7b-64k", "use_chat_template": False, "training_length": 65536}, {"model": "Yarn-Llama-2-7b-128k", "use_chat_template": False, "training_length": 131072}, - # llama 3 models {"model": "Meta-Llama-3-8B", "use_chat_template": False, "training_length": 8192}, {"model": "Meta-Llama-3-8B-Instruct", "training_length": 8192}, @@ -189,58 +240,57 @@ def get_metric_by_depth(self): {"model": "Meta-Llama-3-8B-Instruct-Theta8M", "training_length": 8192}, {"model": "Meta-Llama-3-70B-Theta8M", "use_chat_template": False, "training_length": 8192}, {"model": "Meta-Llama-3-70B-Instruct-Theta8M", "training_length": 8192}, - {"model": "Meta-Llama-3.1-8B", "use_chat_template": False, "training_length": 131072}, {"model": "Meta-Llama-3.1-8B-Instruct", "training_length": 131072}, {"model": "Meta-Llama-3.1-70B", "use_chat_template": False, "training_length": 131072}, {"model": "Meta-Llama-3.1-70B-Instruct", "training_length": 131072}, - {"model": "Llama-3.2-1B", "use_chat_template": False, "training_length": 131072}, {"model": "Llama-3.2-1B-Instruct", "training_length": 131072}, {"model": "Llama-3.2-3B", "use_chat_template": False, "training_length": 131072}, {"model": "Llama-3.2-3B-Instruct", "training_length": 131072}, - # mistral models {"model": "Mistral-7B-v0.1", "use_chat_template": False, "training_length": 8192}, {"model": "Mistral-7B-Instruct-v0.1", "training_length": 8192}, {"model": "Mistral-7B-Instruct-v0.2", "training_length": 32768}, {"model": "Mistral-7B-v0.3", "use_chat_template": False, "training_length": 32768}, {"model": "Mistral-7B-Instruct-v0.3", "training_length": 32768}, - {"model": "Mistral-Nemo-Base-2407", "use_chat_template": False, "training_length": 128000}, {"model": "Mistral-Nemo-Instruct-2407", "training_length": 128000}, {"model": "MegaBeam-Mistral-7B-512k", "training_length": 524288}, - # yi models {"model": "Yi-6B-200K", "use_chat_template": False, "training_length": 200000}, {"model": "Yi-9B-200K", "use_chat_template": False, "training_length": 200000}, {"model": "Yi-34B-200K", "use_chat_template": False, "training_length": 200000}, {"model": "Yi-1.5-9B-32K", "use_chat_template": False, "training_length": 32768}, - # phi models {"model": "Phi-3-mini-128k-instruct", "training_length": 131072}, {"model": "Phi-3-small-128k-instruct", "training_length": 131072}, {"model": "Phi-3-medium-128k-instruct", "training_length": 131072}, {"model": "Phi-3.5-mini-instruct", "training_length": 131072}, - # 
qwen models {"model": "Qwen2-7B", "use_chat_template": False, "training_length": 32768}, {"model": "Qwen2-7B-Instruct", "training_length": 32768}, {"model": "Qwen2-57B-A14B", "use_chat_template": False, "training_length": 32768}, {"model": "Qwen2-57B-A14B-Instruct", "training_length": 32768}, - # others {"model": "c4ai-command-r-v01", "training_length": 131072}, {"model": "Jamba-v0.1", "use_chat_template": False, "training_length": 262144}, {"model": "AI21-Jamba-1.5-Mini", "training_length": 262144}, - # prolong {"model": "prolong-64k-instruct", "training_length": 65536}, {"model": "prolong-512k-instruct-20b-theta128m", "training_length": 524288}, ] # set your configs here - configs = ["configs/recall.yaml", "configs/rag.yaml", "configs/rerank.yaml", "configs/cite.yaml", "configs/longqa.yaml", "configs/summ.yaml", "configs/icl.yaml"] + configs = [ + "configs/recall.yaml", + "configs/rag.yaml", + "configs/rerank.yaml", + "configs/cite.yaml", + "configs/longqa.yaml", + "configs/summ.yaml", + "configs/icl.yaml", + ] datasets_configs = [] for config in configs: c = yaml.safe_load(open(config)) @@ -249,15 +299,30 @@ def get_metric_by_depth(self): c["generation_max_length"] = ",".join([str(c["generation_max_length"])] * len(c["datasets"].split(","))) if isinstance(c["input_max_length"], int): c["input_max_length"] = ",".join([str(c["input_max_length"])] * len(c["datasets"].split(","))) - for d, t, l, g in zip(c['datasets'].split(','), c['test_files'].split(','), c['input_max_length'].split(','), c['generation_max_length'].split(',')): - datasets_configs.append({"dataset": d, "test_name": os.path.basename(os.path.splitext(t)[0]), "input_max_length": int(l), "generation_max_length": int(g), "use_chat_template": c["use_chat_template"], "max_test_samples": c["max_test_samples"], 'shots': c['shots']}) - + for d, t, l, g in zip( + c["datasets"].split(","), + c["test_files"].split(","), + c["input_max_length"].split(","), + c["generation_max_length"].split(","), + ): + datasets_configs.append( + { + "dataset": d, + "test_name": os.path.basename(os.path.splitext(t)[0]), + "input_max_length": int(l), + "generation_max_length": int(g), + "use_chat_template": c["use_chat_template"], + "max_test_samples": c["max_test_samples"], + "shots": c["shots"], + } + ) + df = [] for model in tqdm(models_configs): args = arguments() - args.tag = "v1" # SET YOUR TAG HERE + args.tag = "v1" # SET YOUR TAG HERE args.output_dir = f"output/{model['model']}" - + for dataset in datasets_configs: args.update(dataset) args.update(model) @@ -267,16 +332,30 @@ def get_metric_by_depth(self): if metric is None: continue - + for k, m in metric.items(): - df.append({**asdict(args), **model, - "metric name": k, "metric": m, - "dataset_simple": dsimple + " " + k, "test_data": f"{args.dataset}-{args.test_name}-{args.input_max_length}" - }) + df.append( + { + **asdict(args), + **model, + "metric name": k, + "metric": m, + "dataset_simple": dsimple + " " + k, + "test_data": f"{args.dataset}-{args.test_name}-{args.input_max_length}", + } + ) all_df = pd.DataFrame(df) - lf_df = all_df.pivot_table(index=["model", "input_max_length", ], columns="dataset_simple", values="metric", sort=False) + lf_df = all_df.pivot_table( + index=[ + "model", + "input_max_length", + ], + columns="dataset_simple", + values="metric", + sort=False, + ) lf_df = lf_df.reset_index() print(lf_df.to_csv(index=False)) - # import pdb; pdb.set_trace() \ No newline at end of file + # import pdb; pdb.set_trace() diff --git a/evals/evaluation/HELMET/scripts/download_data.sh 
b/evals/evaluation/HELMET/scripts/download_data.sh index 7aaed21b..e4bd1960 100644 --- a/evals/evaluation/HELMET/scripts/download_data.sh +++ b/evals/evaluation/HELMET/scripts/download_data.sh @@ -1,2 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + wget -c https://huggingface.co/datasets/princeton-nlp/HELMET/resolve/main/data.tar.gz tar -xvzf data.tar.gz diff --git a/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.py b/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.py index c87b3f24..52a0aeb0 100644 --- a/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.py +++ b/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.py @@ -1,28 +1,36 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse +import glob import json import os -import sys import re +import sys + from tqdm import tqdm -import glob # Get the parent directory path -parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # Add the parent directory to the Python path sys.path.append(parent_dir) from model_utils import OpenAIModel + def parse_output(output, prefix="Answer:"): output = output.replace("\n", " ") def lstrip_string(s, sub): - return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) + return re.sub(f"^{re.escape(sub)}", "", s, flags=re.IGNORECASE) + patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] for pat in patterns: matches = pat.search(output) if matches is not None: - return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix + return lstrip_string( + matches[1].strip(), prefix + ).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix # if still not found, return None, but should actually never get this case... 
return None @@ -50,6 +58,7 @@ def lstrip_string(s, sub): Answer: {parsed_output} """ + def parse_json(text): matches = re.findall(r"\{.*?\}", text, re.DOTALL) if len(matches) > 0: @@ -60,6 +69,7 @@ def parse_json(text): return r return None + def check_metrics(model, results_file, output_file): with open(results_file, "r") as f: results = json.load(f) @@ -67,7 +77,9 @@ def check_metrics(model, results_file, output_file): sum_score = 0 count_score = 0 for idx, d in enumerate(tqdm(results["data"])): - p = judge_prompt.format(question=d['question'], correct_answers=d['answer'], parsed_output=parse_output(d['output'])) + p = judge_prompt.format( + question=d["question"], correct_answers=d["answer"], parsed_output=parse_output(d["output"]) + ) o = model.generate(prompt=p) s = None @@ -98,6 +110,7 @@ def check_metrics(model, results_file, output_file): return results + if __name__ == "__main__": model = OpenAIModel("azure/gpt-4o-2024-05-13", temperature=0.1) parser = argparse.ArgumentParser() @@ -108,13 +121,93 @@ def check_metrics(model, results_file, output_file): shard_idx = args.shard_idx # instruct models - model_to_check = ['gpt-4-0125-preview', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'gpt-4o-mini-2024-07-18', 'claude-3-5-sonnet-20240620', 'gemini-1.5-flash-001', 'gemini-1.5-pro-001', 'Meta-Llama-3-8B-Instruct', 'Meta-Llama-3-8B-Instruct-Theta8M', 'Meta-Llama-3-70B-Instruct-Theta8M', 'Meta-Llama-3.1-8B-Instruct', 'Meta-Llama-3.1-70B-Instruct', 'Mistral-7B-Instruct-v0.1', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-Instruct-v0.3', 'Mistral-Nemo-Instruct-2407', 'Phi-3-mini-128k-instruct', 'Phi-3-small-128k-instruct', 'Phi-3-medium-128k-instruct', 'Phi-3.5-mini-instruct', 'Qwen2-7B-Instruct', 'Qwen2-57B-A14B-Instruct', 'c4ai-command-r-v01', 'AI21-Jamba-1.5-Mini', 'prolong-64k-instruct', 'prolong-512k-instruct-20b-theta128m', "MegaBeam-Mistral-7B-512k"] + model_to_check = [ + "gpt-4-0125-preview", + "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", + "gpt-4o-mini-2024-07-18", + "claude-3-5-sonnet-20240620", + "gemini-1.5-flash-001", + "gemini-1.5-pro-001", + "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3-8B-Instruct-Theta8M", + "Meta-Llama-3-70B-Instruct-Theta8M", + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B-Instruct", + "Mistral-7B-Instruct-v0.1", + "Mistral-7B-Instruct-v0.2", + "Mistral-7B-Instruct-v0.3", + "Mistral-Nemo-Instruct-2407", + "Phi-3-mini-128k-instruct", + "Phi-3-small-128k-instruct", + "Phi-3-medium-128k-instruct", + "Phi-3.5-mini-instruct", + "Qwen2-7B-Instruct", + "Qwen2-57B-A14B-Instruct", + "c4ai-command-r-v01", + "AI21-Jamba-1.5-Mini", + "prolong-64k-instruct", + "prolong-512k-instruct-20b-theta128m", + "MegaBeam-Mistral-7B-512k", + ] # all models - model_to_check = ['gpt-4-0125-preview', 'gpt-4o-mini-2024-07-18', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'claude-3-5-sonnet-20240620', 'gemini-1.5-flash-001', 'gemini-1.5-pro-001', 'LLaMA-2-7B-32K', 'Llama-2-7B-32K-Instruct', 'llama-2-7b-80k-basefixed', 'Yarn-Llama-2-7b-64k', 'Yarn-Llama-2-7b-128k', 'Meta-Llama-3-8B', 'Meta-Llama-3-8B-Instruct', 'Meta-Llama-3-8B-Theta8M', 'Meta-Llama-3-8B-Instruct-Theta8M', 'Meta-Llama-3-70B-Theta8M', 'Meta-Llama-3-70B-Instruct-Theta8M', 'Meta-Llama-3.1-8B', 'Meta-Llama-3.1-8B-Instruct', 'Meta-Llama-3.1-70B', 'Meta-Llama-3.1-70B-Instruct', 'Llama-3.2-1B', 'Llama-3.2-1B-Instruct', 'Llama-3.2-3B', 'Llama-3.2-3B-Instruct', 'Mistral-7B-v0.1', 'Mistral-7B-Instruct-v0.1', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-v0.3', 'Mistral-7B-Instruct-v0.3', 'Mistral-Nemo-Base-2407', 'Mistral-Nemo-Instruct-2407', 
'MegaBeam-Mistral-7B-512k', 'Yi-6B-200K', 'Yi-9B-200K', 'Yi-34B-200K', 'Yi-1.5-9B-32K', 'Phi-3-mini-128k-instruct', 'Phi-3-small-128k-instruct', 'Phi-3-medium-128k-instruct', 'Phi-3.5-mini-instruct', 'Qwen2-7B', 'Qwen2-7B-Instruct', 'Qwen2-57B-A14B', 'Qwen2-57B-A14B-Instruct', 'c4ai-command-r-v01', 'Jamba-v0.1', 'AI21-Jamba-1.5-Mini', 'prolong-64k-instruct', 'prolong-512k-instruct-20b-theta128m'] + model_to_check = [ + "gpt-4-0125-preview", + "gpt-4o-mini-2024-07-18", + "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", + "claude-3-5-sonnet-20240620", + "gemini-1.5-flash-001", + "gemini-1.5-pro-001", + "LLaMA-2-7B-32K", + "Llama-2-7B-32K-Instruct", + "llama-2-7b-80k-basefixed", + "Yarn-Llama-2-7b-64k", + "Yarn-Llama-2-7b-128k", + "Meta-Llama-3-8B", + "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3-8B-Theta8M", + "Meta-Llama-3-8B-Instruct-Theta8M", + "Meta-Llama-3-70B-Theta8M", + "Meta-Llama-3-70B-Instruct-Theta8M", + "Meta-Llama-3.1-8B", + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B", + "Meta-Llama-3.1-70B-Instruct", + "Llama-3.2-1B", + "Llama-3.2-1B-Instruct", + "Llama-3.2-3B", + "Llama-3.2-3B-Instruct", + "Mistral-7B-v0.1", + "Mistral-7B-Instruct-v0.1", + "Mistral-7B-Instruct-v0.2", + "Mistral-7B-v0.3", + "Mistral-7B-Instruct-v0.3", + "Mistral-Nemo-Base-2407", + "Mistral-Nemo-Instruct-2407", + "MegaBeam-Mistral-7B-512k", + "Yi-6B-200K", + "Yi-9B-200K", + "Yi-34B-200K", + "Yi-1.5-9B-32K", + "Phi-3-mini-128k-instruct", + "Phi-3-small-128k-instruct", + "Phi-3-medium-128k-instruct", + "Phi-3.5-mini-instruct", + "Qwen2-7B", + "Qwen2-7B-Instruct", + "Qwen2-57B-A14B", + "Qwen2-57B-A14B-Instruct", + "c4ai-command-r-v01", + "Jamba-v0.1", + "AI21-Jamba-1.5-Mini", + "prolong-64k-instruct", + "prolong-512k-instruct-20b-theta128m", + ] # customize this line according to the file pahts that you want to check - all_paths = [glob.glob(f"output/{m}/narrativeqa_*.json") for m in model_to_check] + all_paths = [glob.glob(f"output/{m}/narrativeqa_*.json") for m in model_to_check] all_paths = [p for p in all_paths if not os.path.exists(p.replace(".json", "-gpt4eval_o.json"))] all_paths = all_paths[shard_idx::num_shards] diff --git a/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.sh b/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.sh index 9fc2bc84..7d08031f 100644 --- a/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.sh +++ b/evals/evaluation/HELMET/scripts/eval_gpt4_longqa.sh @@ -1 +1,4 @@ -for i in {0..15}; do python scripts/eval_gpt4_longqa.py --num_shards 16 --shard_idx $i & done \ No newline at end of file +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +for i in {0..15}; do python scripts/eval_gpt4_longqa.py --num_shards 16 --shard_idx $i & done diff --git a/evals/evaluation/HELMET/scripts/eval_gpt4_summ.py b/evals/evaluation/HELMET/scripts/eval_gpt4_summ.py index 6cc75945..7dca7b4b 100644 --- a/evals/evaluation/HELMET/scripts/eval_gpt4_summ.py +++ b/evals/evaluation/HELMET/scripts/eval_gpt4_summ.py @@ -1,21 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse +import glob import json import os -import sys import re -from tqdm import tqdm -import glob +import sys import numpy as np +from tqdm import tqdm + # Get the parent directory path -parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # Add the parent directory to the Python path sys.path.append(parent_dir) from model_utils import OpenAIModel # prompts inspired 
by https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG -fluency_prompt="""Please act as an impartial judge and evaluate the fluency of the provided text. The text should be coherent, non-repetitive, fluent, and grammatically correct. +fluency_prompt = """Please act as an impartial judge and evaluate the fluency of the provided text. The text should be coherent, non-repetitive, fluent, and grammatically correct. Below is your grading rubric: - Score 0 (incoherent, repetitive, or incomplete): Incoherent sentences, repetitive sentences (even if not by exact words), incomplete answers, or gibberish. Note that even if the answer is coherent, if it is repetitive or incomplete, it should be given a score of 0. @@ -34,7 +38,7 @@ Text: "{text}" """ -fluency_prompt_book="""Please act as an impartial judge and evaluate the fluency of the provided text. The text should be coherent, non-repetitive, fluent, and grammatically correct. +fluency_prompt_book = """Please act as an impartial judge and evaluate the fluency of the provided text. The text should be coherent, non-repetitive, fluent, and grammatically correct. Below is your grading rubric: - Score 0 (incoherent, repetitive, or incomplete): Incoherent sentences, repetitive sentences (even if not by exact words), incomplete answers, or gibberish. Note that even if the answer is coherent, if it is repetitive or incomplete, it should be given a score of 0. @@ -52,7 +56,7 @@ Text: "{text}" """ -recall_prompt="""Please act as an impartial judge and evaluate the quality of the provided summary of a civil lawsuit. The summary is based on a set of legal documents, and it should contain a short description of the background, the parties involved, and the outcomes of the case. The text should contain all the major points in the expert-written summary, which are given to you. +recall_prompt = """Please act as an impartial judge and evaluate the quality of the provided summary of a civil lawsuit. The summary is based on a set of legal documents, and it should contain a short description of the background, the parties involved, and the outcomes of the case. The text should contain all the major points in the expert-written summary, which are given to you. Below is your grading rubric: Recall: @@ -102,7 +106,7 @@ """ -recall_prompt_book="""Please act as an impartial judge and evaluate the quality of the provided summary of a novel. It should discuss the plots and characters of the story. The text should contain all the given key points. +recall_prompt_book = """Please act as an impartial judge and evaluate the quality of the provided summary of a novel. It should discuss the plots and characters of the story. The text should contain all the given key points. Below is your grading rubric: Recall: @@ -213,7 +217,7 @@ """ -precision_prompt="""Please act as an impartial judge and evaluate the quality of the provided summary of a civil lawsuit. The summary is based on a set of legal documents, and it should contain a short description of the background, the parties involved, and the outcomes of the case. +precision_prompt = """Please act as an impartial judge and evaluate the quality of the provided summary of a civil lawsuit. The summary is based on a set of legal documents, and it should contain a short description of the background, the parties involved, and the outcomes of the case. 
Below is your grading rubric: Precision: @@ -250,7 +254,7 @@ """ -precision_prompt_book="""Please act as an impartial judge and evaluate the quality of the provided summary of a novel. +precision_prompt_book = """Please act as an impartial judge and evaluate the quality of the provided summary of a novel. Below is your grading rubric: Precision: @@ -337,6 +341,7 @@ def parse_json(text): return json.loads(matches[-1]) return None + def check_metrics(model, results_file, output_file): with open(results_file, "r") as f: results = json.load(f) @@ -353,17 +358,22 @@ def check_metrics(model, results_file, output_file): d = json.loads(line) keypoints[d["id"]] = d["summary/short_keypoints"] - for idx, d in enumerate(tqdm(results["data"])): d["keypoints"] = keypoints[d["id"]] if "infbench" in results_file: fp = fluency_prompt_book.format(text=d["output"].strip()) - rp = recall_prompt_book.format(keypoints="\n".join([f"{i+1}. {kp}" for i, kp in enumerate(d["keypoints"])]), summary=d["output"].strip()) + rp = recall_prompt_book.format( + keypoints="\n".join([f"{i+1}. {kp}" for i, kp in enumerate(d["keypoints"])]), + summary=d["output"].strip(), + ) pp = precision_prompt_book.format(expert_summary=d["answer"][0], summary=d["output"].strip()) else: fp = fluency_prompt.format(text=d["output"].strip()) - rp = recall_prompt.format(keypoints="\n".join([f"{i+1}. {kp}" for i, kp in enumerate(d["keypoints"])]), summary=d["output"].strip()) + rp = recall_prompt.format( + keypoints="\n".join([f"{i+1}. {kp}" for i, kp in enumerate(d["keypoints"])]), + summary=d["output"].strip(), + ) pp = precision_prompt.format(expert_summary=d["summary/long"], summary=d["output"].strip()) def get_score(prompt, tries=2): @@ -412,7 +422,9 @@ def get_score(prompt, tries=2): print(f"Scores: {d['gpt4-scores']}") else: print("Warning! 
Couldn't get a score") - print(f"GPT-4 output: \n---fluency call---\n{fo['output']}\n---recall call---\n{ro['output']}\n---precision call---\n{po['output']}\n------") + print( + f"GPT-4 output: \n---fluency call---\n{fo['output']}\n---recall call---\n{ro['output']}\n---precision call---\n{po['output']}\n------" + ) # import pdb; pdb.set_trace() if len([d for d in results["data"] if "gpt4-scores" in d]) == 0: raise Exception("No scores found") @@ -431,6 +443,7 @@ def get_score(prompt, tries=2): return results + if __name__ == "__main__": model = OpenAIModel("azure/gpt-4o-2024-05-13", temperature=0.1, generation_max_length=4096) @@ -442,12 +455,81 @@ def get_score(prompt, tries=2): shard_idx = args.shard_idx # this is all of our chat models - model_to_check = ['gpt-4-0125-preview', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'gpt-4o-mini-2024-07-18', 'claude-3-5-sonnet-20240620', 'gemini-1.5-flash-001', 'gemini-1.5-pro-001', 'Meta-Llama-3-8B-Instruct', 'Meta-Llama-3-8B-Instruct-Theta8M', 'Meta-Llama-3-70B-Instruct-Theta8M', 'Meta-Llama-3.1-8B-Instruct', 'Meta-Llama-3.1-70B-Instruct', 'Mistral-7B-Instruct-v0.1', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-Instruct-v0.3', 'Mistral-Nemo-Instruct-2407', 'Phi-3-mini-128k-instruct', 'Phi-3-small-128k-instruct', 'Phi-3-medium-128k-instruct', 'Phi-3.5-mini-instruct', 'Qwen2-7B-Instruct', 'Qwen2-57B-A14B-Instruct', 'c4ai-command-r-v01', 'AI21-Jamba-1.5-Mini', 'prolong-64k-instruct', 'prolong-512k-instruct-20b-theta128m', "MegaBeam-Mistral-7B-512k"] - - model_to_check = ['gpt-4-0125-preview', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'gpt-4o-mini-2024-07-18', 'claude-3-5-sonnet-20240620', 'gemini-1.5-flash-001', 'gemini-1.5-pro-001', 'Meta-Llama-3-8B-Theta8M', 'Meta-Llama-3-8B-Instruct-Theta8M', 'Meta-Llama-3-70B-Theta8M', 'Meta-Llama-3-70B-Instruct-Theta8M', 'Meta-Llama-3.1-8B', 'Meta-Llama-3.1-8B-Instruct', 'Meta-Llama-3.1-70B', 'Meta-Llama-3.1-70B-Instruct', "Llama-3.2-1B", "Llama-3.2-1B-Instruct", "Llama-3.2-3B", "Llama-3.2-3B-Instruct", 'llama-2-7b-80k-basefixed', 'Yarn-Llama-2-7b-128k', 'Mistral-7B-Instruct-v0.1', 'Mistral-7B-Instruct-v0.2', 'Mistral-7B-v0.3', 'Mistral-7B-Instruct-v0.3', 'Mistral-Nemo-Instruct-2407', 'MegaBeam-Mistral-7B-512k', 'Phi-3-mini-128k-instruct', 'Phi-3-small-128k-instruct', 'Phi-3-medium-128k-instruct', 'Phi-3.5-mini-instruct', 'Yi-6B-200K', 'Yi-9B-200K', 'Yi-34B-200K', 'Qwen2-7B-Instruct', 'Qwen2-57B-A14B-Instruct', 'AI21-Jamba-1.5-Mini', 'prolong-512k-instruct-20b-theta128m',] - - #just replace the glob pattern - all_paths = [glob.glob(f"output/{m}/multi_lexsum_*_v12_*max400min*.json") for m in model_to_check] + [glob.glob(f"output/{m}/infbench_sum_*_v12_*max1200min*.json") for m in model_to_check] + model_to_check = [ + "gpt-4-0125-preview", + "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", + "gpt-4o-mini-2024-07-18", + "claude-3-5-sonnet-20240620", + "gemini-1.5-flash-001", + "gemini-1.5-pro-001", + "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3-8B-Instruct-Theta8M", + "Meta-Llama-3-70B-Instruct-Theta8M", + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B-Instruct", + "Mistral-7B-Instruct-v0.1", + "Mistral-7B-Instruct-v0.2", + "Mistral-7B-Instruct-v0.3", + "Mistral-Nemo-Instruct-2407", + "Phi-3-mini-128k-instruct", + "Phi-3-small-128k-instruct", + "Phi-3-medium-128k-instruct", + "Phi-3.5-mini-instruct", + "Qwen2-7B-Instruct", + "Qwen2-57B-A14B-Instruct", + "c4ai-command-r-v01", + "AI21-Jamba-1.5-Mini", + "prolong-64k-instruct", + "prolong-512k-instruct-20b-theta128m", + "MegaBeam-Mistral-7B-512k", + ] + + model_to_check = [ 
+ "gpt-4-0125-preview", + "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", + "gpt-4o-mini-2024-07-18", + "claude-3-5-sonnet-20240620", + "gemini-1.5-flash-001", + "gemini-1.5-pro-001", + "Meta-Llama-3-8B-Theta8M", + "Meta-Llama-3-8B-Instruct-Theta8M", + "Meta-Llama-3-70B-Theta8M", + "Meta-Llama-3-70B-Instruct-Theta8M", + "Meta-Llama-3.1-8B", + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B", + "Meta-Llama-3.1-70B-Instruct", + "Llama-3.2-1B", + "Llama-3.2-1B-Instruct", + "Llama-3.2-3B", + "Llama-3.2-3B-Instruct", + "llama-2-7b-80k-basefixed", + "Yarn-Llama-2-7b-128k", + "Mistral-7B-Instruct-v0.1", + "Mistral-7B-Instruct-v0.2", + "Mistral-7B-v0.3", + "Mistral-7B-Instruct-v0.3", + "Mistral-Nemo-Instruct-2407", + "MegaBeam-Mistral-7B-512k", + "Phi-3-mini-128k-instruct", + "Phi-3-small-128k-instruct", + "Phi-3-medium-128k-instruct", + "Phi-3.5-mini-instruct", + "Yi-6B-200K", + "Yi-9B-200K", + "Yi-34B-200K", + "Qwen2-7B-Instruct", + "Qwen2-57B-A14B-Instruct", + "AI21-Jamba-1.5-Mini", + "prolong-512k-instruct-20b-theta128m", + ] + + # just replace the glob pattern + all_paths = [glob.glob(f"output/{m}/multi_lexsum_*_v12_*max400min*.json") for m in model_to_check] + [ + glob.glob(f"output/{m}/infbench_sum_*_v12_*max1200min*.json") for m in model_to_check + ] all_paths = [item for sublist in all_paths for item in sublist if item.endswith(".json")] all_paths = [p for p in all_paths if not os.path.exists(p.replace(".json", "-gpt4eval_o.json"))] @@ -459,4 +541,3 @@ def get_score(prompt, tries=2): newp = p.replace(".json", "-gpt4eval_o.json") print("evaluating") check_metrics(model, p, newp) - diff --git a/evals/evaluation/HELMET/scripts/eval_gpt4_summ.sh b/evals/evaluation/HELMET/scripts/eval_gpt4_summ.sh index 85bf0ac7..0168e661 100644 --- a/evals/evaluation/HELMET/scripts/eval_gpt4_summ.sh +++ b/evals/evaluation/HELMET/scripts/eval_gpt4_summ.sh @@ -1 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + for i in {0..15}; do python scripts/eval_gpt4_summ.py --num_shards 16 --shard_idx $i & done diff --git a/evals/evaluation/HELMET/scripts/generate_configs.py b/evals/evaluation/HELMET/scripts/generate_configs.py index 898732a7..7d595bbe 100644 --- a/evals/evaluation/HELMET/scripts/generate_configs.py +++ b/evals/evaluation/HELMET/scripts/generate_configs.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import yaml # cannot be shared ones: use_chat_template, shots, and stop_new_line @@ -5,90 +8,120 @@ lengths_mapping = {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072} master_mapping = { # ruler tasks, shots: 0, use_chat_template: False, and stop_new_line: False - "ruler_niah_s_1": { # NIAH Repeat + "ruler_niah_s_1": { # NIAH Repeat k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_1/validation_{v}.jsonl" - } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() + "input_length": v, + "generation_max_length": 50, + "test_files": f"data/ruler/niah_single_1/validation_{v}.jsonl", + } + for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() }, - "ruler_niah_s_2": { # NIAH + "ruler_niah_s_2": { # NIAH k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_2/validation_{v}.jsonl" - } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() + "input_length": v, + 
"generation_max_length": 50, + "test_files": f"data/ruler/niah_single_2/validation_{v}.jsonl", + } + for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() }, - "ruler_niah_s_3": { # NIAH UUID + "ruler_niah_s_3": { # NIAH UUID k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_3/validation_{v}.jsonl" - } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() + "input_length": v, + "generation_max_length": 50, + "test_files": f"data/ruler/niah_single_3/validation_{v}.jsonl", + } + for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() }, - "ruler_niah_mk_1": { # NIAH MK Essay + "ruler_niah_mk_1": { # NIAH MK Essay k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_1/validation_{v}.jsonl" - } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() + "input_length": v, + "generation_max_length": 50, + "test_files": f"data/ruler/niah_multikey_1/validation_{v}.jsonl", + } + for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() }, - "ruler_niah_mk_2": { # NIAH MK Needle + "ruler_niah_mk_2": { # NIAH MK Needle k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_2/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 50, + "test_files": f"data/ruler/niah_multikey_2/validation_{v}.jsonl", + } + for k, v in lengths_mapping.items() }, - "ruler_niah_mk_3": { # NIAH MK UUID + "ruler_niah_mk_3": { # NIAH MK UUID k: { - "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multikey_3/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 100, + "test_files": f"data/ruler/niah_multikey_3/validation_{v}.jsonl", + } + for k, v in lengths_mapping.items() }, - "ruler_niah_mq": { # NIAH MQ + "ruler_niah_mq": { # NIAH MQ k: { - "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multiquery/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 100, + "test_files": f"data/ruler/niah_multiquery/validation_{v}.jsonl", + } + for k, v in lengths_mapping.items() }, - "ruler_niah_mv": { # NIAH MV + "ruler_niah_mv": { # NIAH MV k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multivalue/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 50, + "test_files": f"data/ruler/niah_multivalue/validation_{v}.jsonl", + } + for k, v in lengths_mapping.items() }, - "ruler_cwe": { # RULER CWE - k: { - "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/cwe/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "ruler_cwe": { # RULER CWE + k: {"input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/cwe/validation_{v}.jsonl"} + for k, v in lengths_mapping.items() }, - "ruler_fwe": { # RULER FWE - k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/fwe/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "ruler_fwe": { # RULER FEW + k: {"input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/few/validation_{v}.jsonl"} + for k, v in 
lengths_mapping.items() }, - "ruler_vt": { # RULER VT - k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/vt/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "ruler_vt": { # RULER VT + k: {"input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/vt/validation_{v}.jsonl"} + for k, v in lengths_mapping.items() }, - "ruler_niah_qa_1": { # SQuAD - k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_1/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "ruler_niah_qa_1": { # SQuAD + k: {"input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_1/validation_{v}.jsonl"} + for k, v in lengths_mapping.items() }, - "ruler_niah_qa_2": { # HotpotQA - k: { - "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_2/validation_{v}.jsonl" - } for k, v in lengths_mapping.items() + "ruler_niah_qa_2": { # HotpotQA + k: {"input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_2/validation_{v}.jsonl"} + for k, v in lengths_mapping.items() }, - "json_kv": { k: { - "input_length": v, "generation_max_length": 100, "test_files": f"data/json_kv/test_k" + ["50", "105", "220", "440", "900", "1800"][i] + "_dep6.jsonl", "demo_files": "" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 100, + "test_files": "data/json_kv/test_k" + ["50", "105", "220", "440", "900", "1800"][i] + "_dep6.jsonl", + "demo_files": "", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, - # generation with citations -- alce - "alce_asqa": { # ASQA + "alce_asqa": { # ASQA k: { - "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/asqa_eval_gtr_top2000.json", "demo_files": f"prompts/asqa_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 300, + "test_files": "data/alce/asqa_eval_gtr_top2000.json", + "demo_files": "prompts/asqa_revised.json", + "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i], + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, - "alce_qampari": { # QAMPARI + "alce_qampari": { # QAMPARI k: { - "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/qampari_eval_gtr_top2000.json", "demo_files": f"prompts/qampari_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 300, + "test_files": "data/alce/qampari_eval_gtr_top2000.json", + "demo_files": "prompts/qampari_revised.json", + "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i], + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, - # RAG tasks, using KILT's datasets and retrieval corpus "kilt_nq": { k: { @@ -99,95 +132,166 @@ }, "kilt_triviaqa": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "data/kilt/triviaqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", - "demo_files": "data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "data/kilt/triviaqa-dev-multikilt_1000_k" + + ["20", "50", "105", "220", "440", "1000"][i] + + "_dep6.jsonl", + "demo_files": "data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl", + } + for i, 
(k, v) in enumerate(lengths_mapping.items()) }, "kilt_hotpotqa": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "data/kilt/hotpotqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep3.jsonl", - "demo_files": "data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "data/kilt/hotpotqa-dev-multikilt_1000_k" + + ["20", "50", "105", "220", "440", "1000"][i] + + "_dep3.jsonl", + "demo_files": "data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, "kilt_popqa": { k: { +<<<<<<< HEAD "input_length": v, "generation_max_length": 20, "name_postfix": "_3", "test_files": "data/kilt/popqa_test_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", "demo_files": "data/kilt/popqa_test_1000_k3_dep6.jsonl" } for i, (k, v) in enumerate(lengths_mapping.items()) +======= + "input_length": v, + "generation_max_length": 20, + "name_postfix": "_3", + "test_files": "data/kilt/popqa_test_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", + "demo_files": "data/kilt/popqa_test_1000_k3_dep6.jsonl", + } + for i, (k, v) in enumerate(lengths_mapping.items()) +>>>>>>> 66269c7f0dedd9c69c86471ff01379e64215d13e }, - # for longqa, we truncate by the length - 200 - the generation length "narrativeqa": { k: { - "input_length": v, "generation_max_length": 100, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 100}" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 100, + "test_files": "", + "demo_files": "", + "name_postfix": f"_{v - 200 - 100}", + } + for k, v in lengths_mapping.items() }, "infbench_qa_eng": { k: { - "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 10, + "test_files": "", + "demo_files": "", + "name_postfix": f"_{v - 200 - 10}", + } + for k, v in lengths_mapping.items() }, "infbench_choice_eng": { k: { - "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 10, + "test_files": "", + "demo_files": "", + "name_postfix": f"_{v - 200 - 10}", + } + for k, v in lengths_mapping.items() }, - "infbench_sum_eng": { k: { - "input_length": v, "generation_max_length": 1200, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 1200}" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 1200, + "test_files": "", + "demo_files": "", + "name_postfix": f"_{v - 200 - 1200}", + } + for k, v in lengths_mapping.items() }, # for multi lexsum, we truncate by the length - 300 (prompt and buffer) - 400 (generation) "multi_lexsum": { k: { - "input_length": v, "generation_max_length": 400, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 300 - 400}" - } for k, v in lengths_mapping.items() + "input_length": v, + "generation_max_length": 400, + "test_files": "", + "demo_files": "", + "name_postfix": f"_{v - 300 - 400}", + } + for k, v in lengths_mapping.items() }, - "msmarco_rerank_psg": { k: { +<<<<<<< HEAD "input_length": v, "generation_max_length": 200, "test_files": "data/msmarco/test_reranking_data_k" + ["14", "50", "130", "285", "600", "1000"][i] + 
"_dep3.jsonl", "demo_files": "data/msmarco/test_reranking_data_k10_dep3.jsonl" } for i, (k, v) in enumerate(lengths_mapping.items()) +======= + "input_length": v, + "generation_max_length": 200, + "test_files": "data/msmarco/test_reranking_data_k" + + ["14", "50", "130", "285", "600", "1000"][i] + + "_dep3.jsonl", + "demo_files": "data/msmarco/test_reranking_data_k10_dep3.jsonl", + } + for i, (k, v) in enumerate(lengths_mapping.items()) +>>>>>>> 66269c7f0dedd9c69c86471ff01379e64215d13e }, - "icl_trec_coarse": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3300", "6600"][i] + "shot_balance" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "", + "demo_files": "", + "name_postfix": "_" + ["200", "400", "800", "1600", "3300", "6600"][i] + "shot_balance", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, "icl_trec_fine": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3200", "6400"][i] + "shot_balance" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "", + "demo_files": "", + "name_postfix": "_" + ["200", "400", "800", "1600", "3200", "6400"][i] + "shot_balance", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, "icl_banking77": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "", "demo_files": "", "name_postfix": "_" + ["180", "360", "720", "1450", "2900", "5900"][i] + "shot_balance" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "", + "demo_files": "", + "name_postfix": "_" + ["180", "360", "720", "1450", "2900", "5900"][i] + "shot_balance", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, "icl_clinic150": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "", "demo_files": "", "name_postfix": "_" + ["220", "440", "880", "1750", "3525", "7050"][i] + "shot_balance" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "", + "demo_files": "", + "name_postfix": "_" + ["220", "440", "880", "1750", "3525", "7050"][i] + "shot_balance", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, "icl_nlu": { k: { - "input_length": v, "generation_max_length": 20, - "test_files": "", "demo_files": "", "name_postfix": "_" + ["250", "510", "1020", "2040", "4080", "8296"][i] + "shot_balance" - } for i, (k, v) in enumerate(lengths_mapping.items()) + "input_length": v, + "generation_max_length": 20, + "test_files": "", + "demo_files": "", + "name_postfix": "_" + ["250", "510", "1020", "2040", "4080", "8296"][i] + "shot_balance", + } + for i, (k, v) in enumerate(lengths_mapping.items()) }, } + def process_configs(config_name, datasets, input_lengths, **kwargs): configs = [] for i, d in enumerate(datasets): @@ -196,27 +300,43 @@ def process_configs(config_name, datasets, input_lengths, **kwargs): for l in input_lengths: c = con[l] print(c) - configs.append({ - "input_max_length": c['input_length'], - "datasets": d + c.get("name_postfix", ""), - "generation_max_length": c['generation_max_length'], - "test_files": c.get("test_files", ""), - "demo_files": c.get("demo_files", ""), - }) + configs.append( + { + "input_max_length": 
c["input_length"], + "datasets": d + c.get("name_postfix", ""), + "generation_max_length": c["generation_max_length"], + "test_files": c.get("test_files", ""), + "demo_files": c.get("demo_files", ""), + } + ) out_config = {k: ",".join([str(c[k]) for c in configs]) for k in configs[0]} # llama 3 by default but you can change it to anything else +<<<<<<< HEAD out_config.update({ **kwargs, "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", "output_dir": "output/Llama-3.1-8B-Instruct", }) +======= + out_config.update( + { + **kwargs, + "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "output_dir": "output/Llama-3.1-8B-Instruct", + "model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "output_dir": "output/Llama-3.2-1B-Instruct", + } + ) +>>>>>>> 66269c7f0dedd9c69c86471ff01379e64215d13e with open(config_name, "w") as f: yaml.dump(out_config, f, sort_keys=False) -def helmet_configs(input_lengths = ["128k"], fname_postfix = ""): + +def helmet_configs(input_lengths=["128k"], fname_postfix=""): synthetic = ["ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mv", "json_kv"] # ruler actually doesn't support demos so it defaults to 0, json kv uses 2 process_configs( +<<<<<<< HEAD f"configs/recall{fname_postfix}.yaml", synthetic, input_lengths, use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=False ) @@ -225,42 +345,87 @@ def helmet_configs(input_lengths = ["128k"], fname_postfix = ""): process_configs( f"configs/rag{fname_postfix}.yaml", rag, input_lengths, use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True # could be false but set to true so it runs faster +======= + f"configs/recall{fname_postfix}.yaml", + synthetic, + input_lengths, + use_chat_template=False, + max_test_samples=100, + shots=2, + stop_new_line=False, +>>>>>>> 66269c7f0dedd9c69c86471ff01379e64215d13e + ) + + rag = ["kilt_nq", "kilt_triviaqa", "kilt_hotpotqa", "kilt_popqa"] + process_configs( + f"configs/rag{fname_postfix}.yaml", + rag, + input_lengths, + use_chat_template=False, + max_test_samples=100, + shots=2, + stop_new_line=True, # could be false but set to true so it runs faster ) - longqa = ['narrativeqa', 'infbench_qa_eng', 'infbench_choice_eng'] + longqa = ["narrativeqa", "infbench_qa_eng", "infbench_choice_eng"] process_configs( - f"configs/longqa{fname_postfix}.yaml", longqa, input_lengths, - use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False + f"configs/longqa{fname_postfix}.yaml", + longqa, + input_lengths, + use_chat_template=True, + max_test_samples=100, + shots=2, + stop_new_line=False, ) - summ = ['infbench_sum_eng', 'multi_lexsum'] + summ = ["infbench_sum_eng", "multi_lexsum"] process_configs( - f"configs/summ{fname_postfix}.yaml", summ, input_lengths, - use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False + f"configs/summ{fname_postfix}.yaml", + summ, + input_lengths, + use_chat_template=True, + max_test_samples=100, + shots=2, + stop_new_line=False, ) - icl = ['icl_trec_coarse', 'icl_trec_fine', 'icl_banking77', 'icl_clinic150', 'icl_nlu'] + icl = ["icl_trec_coarse", "icl_trec_fine", "icl_banking77", "icl_clinic150", "icl_nlu"] process_configs( - f"configs/icl{fname_postfix}.yaml", icl, input_lengths, - use_chat_template=False, max_test_samples=100, shots=0, stop_new_line=True + f"configs/icl{fname_postfix}.yaml", + icl, + input_lengths, + use_chat_template=False, + max_test_samples=100, + shots=0, + stop_new_line=True, ) rerank = ["msmarco_rerank_psg"] process_configs( - 
f"configs/rerank{fname_postfix}.yaml", rerank, input_lengths, - use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True + f"configs/rerank{fname_postfix}.yaml", + rerank, + input_lengths, + use_chat_template=False, + max_test_samples=100, + shots=2, + stop_new_line=True, ) cite = ["alce_asqa", "alce_qampari"] process_configs( - f"configs/cite{fname_postfix}.yaml", cite, input_lengths, - use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False + f"configs/cite{fname_postfix}.yaml", + cite, + input_lengths, + use_chat_template=True, + max_test_samples=100, + shots=2, + stop_new_line=False, ) def niah_configs(): input_lengths = [8192, 16384, 32768, 65536, 131072] - dataset=["ruler_niah_s_2"] + dataset = ["ruler_niah_s_2"] gen_lengths = [50] for i, l in enumerate(input_lengths): config = { @@ -270,7 +435,7 @@ def niah_configs(): "test_files": f'data/ruler/{dataset[0].replace("ruler_", "").replace("_s_", "_single_")}/validation_{l}.jsonl', "demo_files": "", } - with open(f"configs/niah.yaml", "w") as f: + with open("configs/niah.yaml", "w") as f: yaml.dump(config, f, sort_keys=False) @@ -278,7 +443,21 @@ def ruler_all_configs(): input_lengths = [4096, 8192, 16384, 32768] input_lengths = [65536, 131072] - dataset=["ruler_niah_s_1", "ruler_niah_s_2", "ruler_niah_s_3", "ruler_niah_mk_1", "ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mq", "ruler_niah_mv", "ruler_cwe", "ruler_fwe", "ruler_vt", "ruler_qa_1", "ruler_qa_2"] + dataset = [ + "ruler_niah_s_1", + "ruler_niah_s_2", + "ruler_niah_s_3", + "ruler_niah_mk_1", + "ruler_niah_mk_2", + "ruler_niah_mk_3", + "ruler_niah_mq", + "ruler_niah_mv", + "ruler_cwe", + "ruler_fwe", + "ruler_vt", + "ruler_qa_1", + "ruler_qa_2", + ] gen_lengths = [50, 50, 50, 50, 50, 100, 100, 50, 100, 50, 50, 50, 50] assert len(dataset) == len(gen_lengths) @@ -286,16 +465,19 @@ def ruler_all_configs(): configs = [] for i, d in enumerate(dataset): for l in input_lengths: - configs.append({ - "input_max_length": l, - "datasets": d, - "generation_max_length": gen_lengths[i], - "test_files": f'data/ruler/{d.replace("ruler_", "").replace("_s_", "_single_").replace("mq", "multiquery").replace("mk", "multikey").replace("mv", "multivalue")}/validation_{l}.jsonl', - "demo_files": "", - }) + configs.append( + { + "input_max_length": l, + "datasets": d, + "generation_max_length": gen_lengths[i], + "test_files": f'data/ruler/{d.replace("ruler_", "").replace("_s_", "_single_").replace("mq", "multiquery").replace("mk", "multikey").replace("mv", "multivalue")}/validation_{l}.jsonl', + "demo_files": "", + } + ) # with open(f"configs/ruler_all{'' if max(input_lengths) <= 2**15 else '_long'}.yaml", "w") as f: with open(f"configs/niah{'' if max(input_lengths) <= 2**15 else '_long'}.yaml", "w") as f: +<<<<<<< HEAD config = { k: ",".join([str(c[k]) for c in configs]) for k in configs[0] } @@ -307,6 +489,19 @@ def ruler_all_configs(): "model_name_or_path": "/scratch/gpfs/hyen/models/Meta-Llama-3.1-8B", "output_dir": "output/Meta-Llama-3.1-8B", }) +======= + config = {k: ",".join([str(c[k]) for c in configs]) for k in configs[0]} + config.update( + { + "use_chat_template": False, + "max_test_samples": 100, + "shots": 0, + "stop_new_line": False, + "model_name_or_path": "/scratch/gpfs/hyen/models/Meta-Llama-3.1-8B", + "output_dir": "output/Meta-Llama-3.1-8B", + } + ) +>>>>>>> 66269c7f0dedd9c69c86471ff01379e64215d13e print(config) yaml.dump(config, f, sort_keys=False) diff --git a/evals/evaluation/HELMET/scripts/run_api.sh 
b/evals/evaluation/HELMET/scripts/run_api.sh index b7cb267f..d9fedbda 100644 --- a/evals/evaluation/HELMET/scripts/run_api.sh +++ b/evals/evaluation/HELMET/scripts/run_api.sh @@ -1,5 +1,8 @@ #!/bin/bash -l +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + ############################## # Job blueprint # ############################## @@ -8,7 +11,7 @@ #SBATCH --job-name=api ## CHANGE JOBNAME HERE #SBATCH --array=0 -# Remove one # to uncommment +# Remove one # to uncomment #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr diff --git a/evals/evaluation/HELMET/scripts/run_eval.sh b/evals/evaluation/HELMET/scripts/run_eval.sh index f9ec07c8..1b9b3ab4 100644 --- a/evals/evaluation/HELMET/scripts/run_eval.sh +++ b/evals/evaluation/HELMET/scripts/run_eval.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do python eval.py --config configs/${task}.yaml done @@ -5,4 +8,4 @@ done this will run the 8k to 64k versions for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do python eval.py --config configs/${task}_short.yaml -done \ No newline at end of file +done diff --git a/evals/evaluation/HELMET/scripts/run_eval_slurm.sh b/evals/evaluation/HELMET/scripts/run_eval_slurm.sh index 474231d5..a889ccf3 100644 --- a/evals/evaluation/HELMET/scripts/run_eval_slurm.sh +++ b/evals/evaluation/HELMET/scripts/run_eval_slurm.sh @@ -1,5 +1,8 @@ #!/bin/bash -l +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + ############################## # Job blueprint # ############################## @@ -8,7 +11,7 @@ #SBATCH --job-name=helmet ## CHANGE JOBNAME HERE #SBATCH --array=0-35 -# Remove one # to uncommment +# Remove one # to uncomment #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr @@ -152,4 +155,3 @@ wait; #echo "done, check $OUTPUT_DIR for outputs" #exit 0 - diff --git a/evals/evaluation/HELMET/scripts/run_short_slurm.sh b/evals/evaluation/HELMET/scripts/run_short_slurm.sh index f4d685e6..47c3ce78 100644 --- a/evals/evaluation/HELMET/scripts/run_short_slurm.sh +++ b/evals/evaluation/HELMET/scripts/run_short_slurm.sh @@ -1,5 +1,8 @@ #!/bin/bash -l +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + ############################## # Job blueprint # ############################## @@ -8,7 +11,7 @@ #SBATCH --job-name=helmet_short ## CHANGE JOBNAME HERE #SBATCH --array=0 -# Remove one # to uncommment +# Remove one # to uncomment #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr diff --git a/evals/evaluation/HELMET/utils.py b/evals/evaluation/HELMET/utils.py index 7ca2c40d..bd375071 100644 --- a/evals/evaluation/HELMET/utils.py +++ b/evals/evaluation/HELMET/utils.py @@ -1,27 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 """ Adopted from https://github.com/princeton-nlp/DensePhrases/blob/main/densephrases/utils/eval_utils.py """ import os import string +import logging import re -import unicodedata -from collections import Counter +import string import sys - import time -from rouge_score import rouge_scorer +import unicodedata +from collections import Counter +import pytrec_eval import torch import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoModel -import pytrec_eval +from 
rouge_score import rouge_scorer +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GenerationConfig # import tensor_parallel as tp -import logging -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S') +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -29,14 +30,14 @@ def normalize_answer(s): def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) + return re.sub(r"\b(a|an|the)\b", " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() @@ -54,9 +55,9 @@ def f1_score(prediction, ground_truth): ZERO_METRIC = (0, 0, 0) - if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if normalized_prediction in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth: return ZERO_METRIC - if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if normalized_ground_truth in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth: return ZERO_METRIC prediction_tokens = normalized_prediction.split() @@ -73,7 +74,7 @@ def f1_score(prediction, ground_truth): def drqa_normalize(text): """Resolve different type of unicode encodings.""" - return unicodedata.normalize('NFD', text) + return unicodedata.normalize("NFD", text) def drqa_exact_match_score(prediction, ground_truth): @@ -81,15 +82,14 @@ def drqa_exact_match_score(prediction, ground_truth): return normalize_answer(prediction) == normalize_answer(ground_truth) -def substring_exact_match_score(prediciton, ground_truth): +def substring_exact_match_score(prediction, ground_truth): """Check if the ground truth is a (soft) exact match substring of the prediction.""" return normalize_answer(ground_truth) in normalize_answer(prediciton) def drqa_metric_max_over_ground_truths(metric_fn, prediction, ground_truths): """Given a prediction and multiple valid answers, return the score of - the best prediction-answer_n pair given a metric function. 
- """ + the best prediction-answer_n pair given a metric function.""" # ground truth could be a string or a list of strings or a list of list of strings if isinstance(ground_truths, str): ground_truths = [ground_truths] @@ -105,8 +105,8 @@ def drqa_metric_max_over_ground_truths(metric_fn, prediction, ground_truths): def get_max_memory(): """Get the maximum memory available for the current GPU for loading models.""" - free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) - max_memory = f'{free_in_GB-6}GB' + free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3) + max_memory = f"{free_in_GB-6}GB" n_gpus = torch.cuda.device_count() max_memory = {i: max_memory for i in range(n_gpus)} return max_memory @@ -124,12 +124,15 @@ def get_top_tokens(logits, tokenizer, top_k=10): def parse_output(output, prefix="Answer:"): def lstrip_string(s, sub): - return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) + return re.sub(f"^{re.escape(sub)}", "", s, flags=re.IGNORECASE) + patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] for pat in patterns: matches = pat.search(output) if matches is not None: - return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix + return lstrip_string( + matches[1].strip(), prefix + ).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix # if still not found, return None, but should actually never get this case... return None @@ -141,7 +144,7 @@ def parse_rankings(output): output = output.lower().replace("id", "") # 2. parse the integer surrounded by >, since all IDs are integers - pattern = r'(\d+)(?:\s*>\s*(\d+))*' + pattern = r"(\d+)(?:\s*>\s*(\d+))*" match = re.finditer(pattern, output) # and take the longest match longest = "" @@ -152,7 +155,7 @@ def parse_rankings(output): if len(longest) > 0: number_string = longest # import to output a list of strings instead of ints, since the IDs are saved as strings (even though they are supposed to be integers) - rankings = [num.strip() for num in number_string.split('>') if num.strip().isdigit()] + rankings = [num.strip() for num in number_string.split(">") if num.strip().isdigit()] else: # if we can't find any numbers, then we just return the whole string (unlikely to get any matches) rankings = [output] @@ -165,7 +168,9 @@ def parse_rankings(output): return results -r_scorer = rouge_scorer.RougeScorer(['rougeL', 'rougeLsum'], use_stemmer=True) +r_scorer = rouge_scorer.RougeScorer(["rougeL", "rougeLsum"], use_stemmer=True) + + def calculate_metrics(prediction, answers): em = drqa_metric_max_over_ground_truths(drqa_exact_match_score, prediction, answers) f1 = drqa_metric_max_over_ground_truths(lambda x, y: f1_score(x, y)[0], prediction, answers) @@ -213,7 +218,9 @@ def calculate_retrieval_metrics(results, qrels, k_values=[1, 5, 10, 25, 50, 100] # https://github.com/cvangysel/pytrec_eval/blob/master/examples/simple_cut.py # qrels = {qid: {'pid': [0/1] (relevance label)}} # results = {qid: {'pid': float (retriever score)}} - evaluator = pytrec_eval.RelevanceEvaluator(qrels, {map_string, ndcg_string, recall_string, precision_string, "recip_rank"}) + evaluator = pytrec_eval.RelevanceEvaluator( + qrels, {map_string, ndcg_string, recall_string, precision_string, "recip_rank"} + ) scores = evaluator.evaluate(results) for query_id in scores.keys(): @@ -221,7 +228,7 @@ def 
calculate_retrieval_metrics(results, qrels, k_values=[1, 5, 10, 25, 50, 100] ndcg[f"NDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)] _map[f"MAP@{k}"] += scores[query_id]["map_cut_" + str(k)] recall[f"Recall@{k}"] += scores[query_id]["recall_" + str(k)] - precision[f"P@{k}"] += scores[query_id]["P_"+ str(k)] + precision[f"P@{k}"] += scores[query_id]["P_" + str(k)] mrr["MRR"] += scores[query_id]["recip_rank"] for k in k_values: