From 93cccf996c51f871967f22f97505358ca333aee1 Mon Sep 17 00:00:00 2001
From: Wenxin Zhang
Date: Mon, 13 May 2024 15:55:45 +0800
Subject: [PATCH] update examples

Signed-off-by: Wenxin Zhang
---
 .github/workflows/scripts/models/collect_log.sh  | 12 +++++++-----
 .github/workflows/scripts/models/model_test.sh   |  8 +++++---
 .../{ => examples}/main.py                       |  0
 .../lm_evaluation_harness/{ => examples}/main.py |  0
 README.md                                        |  6 +++---
 5 files changed, 15 insertions(+), 11 deletions(-)
 rename GenAIEval/evaluation/bigcode_evaluation_harness/{ => examples}/main.py (100%)
 rename GenAIEval/evaluation/lm_evaluation_harness/{ => examples}/main.py (100%)

diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh
index ed18b010..a197393a 100644
--- a/.github/workflows/scripts/models/collect_log.sh
+++ b/.github/workflows/scripts/models/collect_log.sh
@@ -21,6 +21,8 @@ PATTERN='[-a-zA-Z0-9_]*='
 PERF_STABLE_CHECK=true
 for i in "$@"; do
     case $i in
+        --datasets*)
+            datasets=`echo $i | sed "s/${PATTERN}//"`;;
         --device=*)
            device=`echo $i | sed "s/${PATTERN}//"`;;
         --model=*)
@@ -32,14 +34,14 @@ for i in "$@"; do
     esac
 done

-output_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}.log"
+log_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log"
 $BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET
 echo "working in"
 pwd
-if [[ ! -f ${output_file} ]]; then
-    echo "${device};${model};${tasks};;${logfile}" >> ${WORKSPACE}/summary.log
+if [[ ! -f ${log_file} ]]; then
+    echo "${device};${model};${tasks};${datasets};;${logfile}" >> ${WORKSPACE}/summary.log
 else
-    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${acc_log_name} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
-    echo "${device};${model};${tasks};${acc};${logfile}" >> ${WORKSPACE}/summary.log
+    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${log_file} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
+    echo "${device};${model};${tasks};${datasets};${acc};${logfile}" >> ${WORKSPACE}/summary.log
 fi
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index 3633ab4b..4ba5d0d5 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -42,9 +42,9 @@ $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET
 main() {
     case ${tasks} in
         "text-generation")
-            working_dir="/GenAIEval/evaluation/lm_evaluation_harness";;
+            working_dir="/GenAIEval/GenAIEval/evaluation/lm_evaluation_harness/examples";;
         "code-generation")
-            working_dir="/GenAIEval/evaluation/bigcode_evaluation_harness";;
+            working_dir="/GenAIEval/GenAIEval/evaluation/bigcode_evaluation_harness/examples";;
         *) echo "Not suppotted task"; exit 1;;
     esac
@@ -62,11 +62,13 @@ function prepare() {
     else
         echo "Not found requirements.txt file."
     fi
+    if [[ ${device} == "hpu" ]]; then
+        pip install --upgrade-strategy eager optimum[habana]
+    fi
 }

 function run_benchmark() {
     cd ${working_dir}
-    pip install --upgrade-strategy eager optimum[habana]
     overall_log="${log_dir}/${device}-${model}-${tasks}-${datasets}.log"
     python main.py \
         --model hf \
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/main.py b/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/main.py
rename to GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/main.py b/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/main.py
rename to GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
diff --git a/README.md b/README.md
index 830274de..68744fe1 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation

 ```shell
 # pip install --upgrade-strategy eager optimum[habana]
-cd GenAIEval/evaluation/lm_evaluation_harness
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model gaudi-hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -29,7 +29,7 @@ python main.py \
 ##### CPU

 ```shell
-cd GenAIEval/evaluation/lm_evaluation_harness
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -57,7 +57,7 @@ For evaluating the models on coding tasks or specifically coding LLMs, we follow
 #### command line usage

 ```shell
-cd GenAIEval/evaluation/bigcode_evaluation_harness
+cd GenAIEval/evaluation/bigcode_evaluation_harness/examples
 python main.py \
     --model "codeparrot/codeparrot-small" \
     --tasks "humaneval" \
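---
A minimal invocation sketch of the updated collect_log.sh, for reference. The --tasks flag and every value below are illustrative assumptions, not taken from the workflow files: the hunk above shows only the --datasets, --device, and --model cases, and the script reads ${tasks}, ${logfile}, and ${WORKSPACE} from its wider environment.

    # Hypothetical values; the script appends one semicolon-separated
    # record (device;model;tasks;datasets;acc;logfile) to ${WORKSPACE}/summary.log.
    export WORKSPACE=/tmp/ci-workspace
    bash .github/workflows/scripts/models/collect_log.sh \
        --device=cpu \
        --model=gpt-j-6B \
        --tasks=text-generation \
        --datasets=lambada_openai

Note that the accuracy field is left empty when the expected log file /GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log is missing; otherwise it is grepped out of that log.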