From 93cccf996c51f871967f22f97505358ca333aee1 Mon Sep 17 00:00:00 2001
From: Wenxin Zhang
Date: Mon, 13 May 2024 15:55:45 +0800
Subject: [PATCH] update examples

Signed-off-by: Wenxin Zhang
---
 .github/workflows/scripts/models/collect_log.sh  | 12 +++++++-----
 .github/workflows/scripts/models/model_test.sh   |  8 +++++---
 .../{ => examples}/main.py                       |  0
 .../lm_evaluation_harness/{ => examples}/main.py |  0
 README.md                                        |  6 +++---
 5 files changed, 15 insertions(+), 11 deletions(-)
 rename GenAIEval/evaluation/bigcode_evaluation_harness/{ => examples}/main.py (100%)
 rename GenAIEval/evaluation/lm_evaluation_harness/{ => examples}/main.py (100%)

diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh
index ed18b010..a197393a 100644
--- a/.github/workflows/scripts/models/collect_log.sh
+++ b/.github/workflows/scripts/models/collect_log.sh
@@ -21,6 +21,8 @@ PATTERN='[-a-zA-Z0-9_]*='
 PERF_STABLE_CHECK=true
 for i in "$@"; do
     case $i in
+        --datasets*)
+            datasets=`echo $i | sed "s/${PATTERN}//"`;;
         --device=*)
            device=`echo $i | sed "s/${PATTERN}//"`;;
         --model=*)
@@ -32,14 +34,14 @@ for i in "$@"; do
     esac
 done

-output_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}.log"
+log_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log"
 $BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET
 echo "working in"
 pwd
-if [[ ! -f ${output_file} ]]; then
-    echo "${device};${model};${tasks};;${logfile}" >> ${WORKSPACE}/summary.log
+if [[ ! -f ${log_file} ]]; then
+    echo "${device};${model};${tasks};${datasets};;${logfile}" >> ${WORKSPACE}/summary.log
 else
-    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${acc_log_name} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
-    echo "${device};${model};${tasks};${acc};${logfile}" >> ${WORKSPACE}/summary.log
+    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${log_file} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
+    echo "${device};${model};${tasks};${datasets};${acc};${logfile}" >> ${WORKSPACE}/summary.log
 fi
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index 3633ab4b..4ba5d0d5 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -42,9 +42,9 @@ $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET
 main() {
     case ${tasks} in
         "text-generation")
-            working_dir="/GenAIEval/evaluation/lm_evaluation_harness";;
+            working_dir="/GenAIEval/GenAIEval/evaluation/lm_evaluation_harness/examples";;
         "code-generation")
-            working_dir="/GenAIEval/evaluation/bigcode_evaluation_harness";;
+            working_dir="/GenAIEval/GenAIEval/evaluation/bigcode_evaluation_harness/examples";;
         *) echo "Not suppotted task"; exit 1;;
     esac
@@ -62,11 +62,13 @@ function prepare() {
     else
         echo "Not found requirements.txt file."
     fi
+    if [[ ${device} == "hpu" ]]; then
+        pip install --upgrade-strategy eager optimum[habana]
+    fi
 }

 function run_benchmark() {
     cd ${working_dir}
-    pip install --upgrade-strategy eager optimum[habana]
     overall_log="${log_dir}/${device}-${model}-${tasks}-${datasets}.log"
     python main.py \
         --model hf \
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/main.py b/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/main.py
rename to GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/main.py b/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/main.py
rename to GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
diff --git a/README.md b/README.md
index 830274de..68744fe1 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation

 ```shell
 # pip install --upgrade-strategy eager optimum[habana]
-cd GenAIEval/evaluation/lm_evaluation_harness
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model gaudi-hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -29,7 +29,7 @@ python main.py \
 ##### CPU

 ```shell
-cd GenAIEval/evaluation/lm_evaluation_harness
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -57,7 +57,7 @@ For evaluating the models on coding tasks or specifically coding LLMs, we follow
 #### command line usage

 ```shell
-cd GenAIEval/evaluation/bigcode_evaluation_harness
+cd GenAIEval/evaluation/bigcode_evaluation_harness/examples
 python main.py \
     --model "codeparrot/codeparrot-small" \
     --tasks "humaneval" \
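---
A minimal invocation sketch of the updated collect_log.sh, for reference. The --tasks flag and every value below are illustrative assumptions, not taken from the workflow files: the hunk above shows only the --datasets, --device, and --model cases, and the script reads ${tasks}, ${logfile}, and ${WORKSPACE} from its wider environment.

    # Hypothetical values; the script appends one semicolon-separated
    # record (device;model;tasks;datasets;acc;logfile) to ${WORKSPACE}/summary.log.
    export WORKSPACE=/tmp/ci-workspace
    bash .github/workflows/scripts/models/collect_log.sh \
        --device=cpu \
        --model=gpt-j-6B \
        --tasks=text-generation \
        --datasets=lambada_openai

Note that the accuracy field is left empty when the expected log file /GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log is missing; otherwise it is grepped out of that log.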