Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
Signed-off-by: Howard Yen <[email protected]>
  • Loading branch information
howard-yen committed Nov 1, 2024
2 parents e7e5ec9 + 66269c7 commit ccc75c7
Show file tree
Hide file tree
Showing 37 changed files with 1,438 additions and 656 deletions.
4 changes: 2 additions & 2 deletions evals/evaluation/HELMET/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ python eval.py --config configs/cite.yaml --use_vllm
Disclaimer:
VLLM can be much faster than using the native HuggingFace generation; however, we found that the results can be slightly different, so we recommend using the native HuggingFace generation for the final evaluation.
All reported results in the paper are from the native HuggingFace generation.
The speedup is much more noticable for tasks that generates more tokens (e.g., summarization may see up to 2x speedup), whereas the speedup is less noticable for tasks that generate fewer tokens (e.g., JSON KV may see less than 5% speedup).
The speedup is much more noticeable for tasks that generate more tokens (e.g., summarization may see up to 2x speedup), whereas the speedup is less noticeable for tasks that generate fewer tokens (e.g., JSON KV may see less than 5% speedup).

</details>

Expand Down Expand Up @@ -211,7 +211,7 @@ Please also cite the original dataset creators, listed below:
@inproceedings{mallen-etal-2023-trust,
title = "When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories",
author = "Mallen, Alex and
Asai, Akari and
Asai, Akari and
Zhong, Victor and
Das, Rajarshi and
Khashabi, Daniel and
Expand Down
51 changes: 44 additions & 7 deletions evals/evaluation/HELMET/arguments.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import yaml
import ast
import os

import yaml


def parse_arguments():
parser = argparse.ArgumentParser(description="evaluation on downstream tasks")
parser.add_argument("--config", type=str, default=None, help="path to config file")
Expand All @@ -27,27 +32,59 @@ def parse_arguments():

# evaluation settings
parser.add_argument("--shots", type=int, default=5, help="total number of demos (encoder + decoder)")
parser.add_argument("--input_max_length", type=str, default='8192', help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets")
parser.add_argument(
"--input_max_length",
type=str,
default="8192",
help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets",
)

# generation settings
parser.add_argument("--do_sample", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use sampling (false is greedy), overwrites temperature")
parser.add_argument("--generation_max_length", type=str, default='10', help="max number of tokens to generate, can be separated by comma to match the specified datasets")
parser.add_argument(
"--do_sample",
type=ast.literal_eval,
choices=[True, False],
default=False,
help="whether to use sampling (false is greedy), overwrites temperature",
)
parser.add_argument(
"--generation_max_length",
type=str,
default="10",
help="max number of tokens to generate, can be separated by comma to match the specified datasets",
)
parser.add_argument("--generation_min_length", type=int, default=0, help="min number of tokens to generate")
parser.add_argument("--temperature", type=float, default=1.0, help="generation temperature")
parser.add_argument("--top_p", type=float, default=1.0, help="top-p parameter for nucleus sampling")
parser.add_argument("--stop_newline", type=ast.literal_eval, choices=[True, False], default=False, help="whether to stop generation at newline")
parser.add_argument(
"--stop_newline",
type=ast.literal_eval,
choices=[True, False],
default=False,
help="whether to stop generation at newline",
)

# model specific settings
parser.add_argument("--seed", type=int, default=42, help="random seed")
parser.add_argument("--no_cuda", action="store_true", help="disable cuda")
parser.add_argument("--no_bf16", action="store_true", help="disable bf16 and use fp32")
parser.add_argument("--no_torch_compile", action="store_true", help="disable cuda")
parser.add_argument("--use_chat_template", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use chat template")
parser.add_argument(
"--use_chat_template",
type=ast.literal_eval,
choices=[True, False],
default=False,
help="whether to use chat template",
)
parser.add_argument("--rope_theta", type=int, default=None, help="override rope theta")

# misc
parser.add_argument("--debug", action="store_true", help="for debugging")
parser.add_argument("--count_tokens", action="store_true", help="instead of running generation, just count the number of tokens (only for HF models not API)")
parser.add_argument(
"--count_tokens",
action="store_true",
help="instead of running generation, just count the number of tokens (only for HF models not API)",
)

args = parser.parse_args()
config = yaml.safe_load(open(args.config)) if args.config is not None else {}
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/cite.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072
datasets: alce_asqa_700,alce_qampari_700
generation_max_length: 300,300
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/cite_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536
datasets: alce_asqa_30,alce_asqa_75,alce_asqa_165,alce_asqa_345,alce_qampari_30,alce_qampari_75,alce_qampari_165,alce_qampari_345
generation_max_length: 300,300,300,300,300,300,300,300
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/icl.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072,131072
datasets: icl_trec_coarse_6600shot_balance,icl_trec_fine_6400shot_balance,icl_banking77_5900shot_balance,icl_clinic150_7050shot_balance,icl_nlu_8296shot_balance
generation_max_length: 20,20,20,20,20
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/icl_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: icl_trec_coarse_400shot_balance,icl_trec_coarse_800shot_balance,icl_trec_coarse_1600shot_balance,icl_trec_coarse_3300shot_balance,icl_trec_fine_400shot_balance,icl_trec_fine_800shot_balance,icl_trec_fine_1600shot_balance,icl_trec_fine_3200shot_balance,icl_banking77_360shot_balance,icl_banking77_720shot_balance,icl_banking77_1450shot_balance,icl_banking77_2900shot_balance,icl_clinic150_440shot_balance,icl_clinic150_880shot_balance,icl_clinic150_1750shot_balance,icl_clinic150_3525shot_balance,icl_nlu_510shot_balance,icl_nlu_1020shot_balance,icl_nlu_2040shot_balance,icl_nlu_4080shot_balance
generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/longqa.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072
datasets: narrativeqa_130772,infbench_qa_eng_130862,infbench_choice_eng_130862
generation_max_length: 100,10,10
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/longqa_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: narrativeqa_7892,narrativeqa_16084,narrativeqa_32468,narrativeqa_65236,infbench_qa_eng_7982,infbench_qa_eng_16174,infbench_qa_eng_32558,infbench_qa_eng_65326,infbench_choice_eng_7982,infbench_choice_eng_16174,infbench_choice_eng_32558,infbench_choice_eng_65326
generation_max_length: 100,100,100,100,10,10,10,10,10,10,10,10
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/niah.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072
datasets: ruler_niah_s_2
generation_max_length: 50
Expand Down
5 changes: 4 additions & 1 deletion evals/evaluation/HELMET/configs/niah_long.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072
datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2
generation_max_length: 50,50,50,50,50,50,50,50,50,50,100,100,100,100,50,50,100,100,50,50,50,50,50,50,50,50
test_files: data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl
test_files: data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl
demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,'
use_chat_template: false
max_test_samples: 100
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/rag.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072
datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3
generation_max_length: 20,20,20,20
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/rag_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: kilt_nq,kilt_nq,kilt_nq,kilt_nq,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3
generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/recall.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072
datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv
generation_max_length: 50,100,50,100
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/recall_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,json_kv,json_kv,json_kv,json_kv
generation_max_length: 50,50,50,50,100,100,100,100,50,50,50,50,100,100,100,100
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/rerank.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: '131072'
datasets: msmarco_rerank_psg
generation_max_length: '200'
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/rerank_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536
datasets: msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg
generation_max_length: 200,200,200,200
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/summ.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072
datasets: infbench_sum_eng_129672,multi_lexsum_130372
generation_max_length: 1200,400
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/HELMET/configs/summ_short.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536
datasets: infbench_sum_eng_6792,infbench_sum_eng_14984,infbench_sum_eng_31368,infbench_sum_eng_64136,multi_lexsum_7492,multi_lexsum_15684,multi_lexsum_32068,multi_lexsum_64836
generation_max_length: 1200,1200,1200,1200,400,400,400,400
Expand Down
Loading

0 comments on commit ccc75c7

Please sign in to comment.