From 65efc80fdf0e88353bcf730715b6d5980338a4a7 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Fri, 26 Jan 2024 10:46:48 +0000
Subject: [PATCH 1/3] Rename endpoints accuracy tests and modify datasets

---
 tests/config.py                                | 18 ++--
 tests/endpoints.json                           | 97 +++++++++++++++++++
 tests/run_docker_tests.py                      | 57 +----------
 ...st_endpoint.py => smoke_accuracy_tests.py}  | 12 ++--
 4 files changed, 117 insertions(+), 67 deletions(-)
 create mode 100644 tests/endpoints.json
 rename tests/{unittest_endpoint.py => smoke_accuracy_tests.py} (96%)

diff --git a/tests/config.py b/tests/config.py
index 84baf406c5..aff1e2d0c1 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -2,22 +2,22 @@
     "gsm8k": {
         "llama-2-13b-chat-fp16": 2,
         "llama-2-70b-chat-int4": 3,
-        "llama-2-70b-chat-fp16": 4,
+        "llama-2-70b-chat-fp16": 3,
         "codellama-7b-instruct-fp16": 1,
         "codellama-13b-instruct-fp16": 2,
         "codellama-34b-instruct-int4": 3,
-        "codellama-34b-instruct-fp16": 4,
+        "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
     },
     "triviaqa": {
-        "llama-2-13b-chat-fp16": 3,
-        "llama-2-70b-chat-int4": 4,
-        "llama-2-70b-chat-fp16": 5,
-        "codellama-7b-instruct-fp16": 2,
-        "codellama-13b-instruct-fp16": 3,
-        "codellama-34b-instruct-int4": 4,
-        "codellama-34b-instruct-fp16": 5,
+        "llama-2-13b-chat-fp16": 2,
+        "llama-2-70b-chat-int4": 3,
+        "llama-2-70b-chat-fp16": 4,
+        "codellama-7b-instruct-fp16": 1,
+        "codellama-13b-instruct-fp16": 2,
+        "codellama-34b-instruct-int4": 3,
+        "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
     },
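
The mapping above is keyed task -> model -> num_fewshot. For reference, a minimal sketch of a lookup against it, assuming config.py binds the dict to a name such as FEWSHOT_CONFIG (the actual variable name is outside the hunk context):

```python
# Minimal sketch: resolving a few-shot count from the task -> model -> shots
# mapping kept in tests/config.py. FEWSHOT_CONFIG is an assumed name; only the
# dict contents are taken from the patch above.
FEWSHOT_CONFIG = {
    "gsm8k": {"llama-2-13b-chat-fp16": 2, "codellama-7b-instruct-fp16": 1},
    "triviaqa": {"llama-2-13b-chat-fp16": 2, "codellama-7b-instruct-fp16": 1},
}

def num_fewshot_for(task: str, model: str, default: int = 0) -> int:
    # Fall back to 0-shot when a (task, model) pair is not configured.
    return FEWSHOT_CONFIG.get(task, {}).get(model, default)

assert num_fewshot_for("gsm8k", "codellama-7b-instruct-fp16") == 1
```
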
"https://text.octoai.run", + "model": "llama-2-70b-chat-int4", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "llama-2-70b-chat-fp16", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "mistral-7b-instruct-fp16", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "mixtral-8x7b-instruct-fp16", + "context_size": 4096 + } + ] +} \ No newline at end of file diff --git a/tests/run_docker_tests.py b/tests/run_docker_tests.py index 64d8c46b84..e76ab8bb6e 100644 --- a/tests/run_docker_tests.py +++ b/tests/run_docker_tests.py @@ -2,61 +2,14 @@ import subprocess import os -endpoints_data = { - "dev": [ - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-7b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-13b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-34b-instruct-int4", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-34b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-13b-chat-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-70b-chat-int4", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-70b-chat-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "mistral-7b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "mixstral-8x7b-instruct-fp16", - }, - ], - "prod": [ - {"url": "https://text.octoai.run", "model": "codellama-7b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "codellama-13b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-int4"}, - {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "llama-2-13b-chat-fp16"}, - {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-int4"}, - {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-fp16"}, - {"url": "https://text.octoai.run", "model": "mistral-7b-instruct-fp16"}, - ], -} +with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'endpoints.json'), 'r') as f: + # Загружаем данные из файла + endpoints_data = json.load(f) token = os.getenv("OCTOAI_TOKEN") current_directory = os.getcwd() path = "test_results" -prod = "prod" +prod = "Prod" if not os.path.exists(os.path.join(current_directory, path)): os.makedirs(path) @@ -67,6 +20,6 @@ model_name = model_info["model"] model_url = model_info["url"] - docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/unittest_endpoint.py -vv --model_name {model_name} --endpoint {model_url}" + docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/smoke_accuracy_tests.py -vv --model_name {model_name} --endpoint {model_url}" subprocess.run(docker_command, shell=True) diff --git a/tests/unittest_endpoint.py b/tests/smoke_accuracy_tests.py similarity index 96% rename from tests/unittest_endpoint.py rename to tests/smoke_accuracy_tests.py index 36f1addb3f..9ccf120f51 100644 --- a/tests/unittest_endpoint.py +++ b/tests/smoke_accuracy_tests.py @@ -35,12 +35,12 @@ def 
diff --git a/tests/unittest_endpoint.py b/tests/smoke_accuracy_tests.py
similarity index 96%
rename from tests/unittest_endpoint.py
rename to tests/smoke_accuracy_tests.py
index 36f1addb3f..9ccf120f51 100644
--- a/tests/unittest_endpoint.py
+++ b/tests/smoke_accuracy_tests.py
@@ -35,12 +35,12 @@ def check_output(model_name, task_name, num_fewshot):
             ), f"Found the wrong answer or the incorrect scoring case:\nPredicted:\n{i['logit_0']}\nTruth:\n{i['truth']}"
 
         result = (f"test_endpoint_{task_name}", model_name, "PASSED")
+        write_results_to_csv(result)
     except AssertionError as e:
         result = (f"test_endpoint_{task_name}", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
         raise e
 
-    write_results_to_csv(result)
-
 
 @pytest.fixture
 def model_name(request):
@@ -64,15 +64,15 @@ def test_endpoint_availability(model_name, endpoint, token):
     ]
 
     try:
-        assert run_chat_completion(model_name, messages, token, endpoint) == 200
-        result = ("test_endpoint_availability", model_name, "PASSED")
+        assert run_chat_completion(model_name, messages, token, endpoint) == 200
+        result = ("test_endpoint_availability", model_name, "PASSED")
+        write_results_to_csv(result)
     except AssertionError as e:
         result = ("test_endpoint_availability", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
         raise e
 
-    write_results_to_csv(result)
-
 
 def test_endpoint_gsm8k(model_name, endpoint, token):
     num_fewshot = 0
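
The tests in this file report through a write_results_to_csv helper whose definition is outside the hunk context. A hypothetical sketch of such a helper, assuming one shared CSV under test_results/ (the path and column layout are assumptions; only the (test_name, model_name, status) tuple shape comes from the calls above):

```python
# Hypothetical helper matching the write_results_to_csv(result) calls above.
# RESULTS_CSV is an assumed location; only the 3-tuple shape
# (test_name, model_name, status) is taken from the patch.
import csv
import os

RESULTS_CSV = os.path.join("test_results", "results.csv")

def write_results_to_csv(result):
    # Append one result row, creating the output directory on first use.
    os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)
    with open(RESULTS_CSV, "a", newline="") as f:
        # e.g. ("test_endpoint_gsm8k", "llama-2-13b-chat-fp16", "PASSED")
        csv.writer(f).writerow(result)
```
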
From 01253cf54019798f33148d683fd18babba4bc977 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Mon, 12 Feb 2024 07:50:39 +0000
Subject: [PATCH 2/3] Support (dev) llamaguard in smoke accuracy tests

---
 lm_eval/tasks/__init__.py                   | 16 +++++---
 lm_eval/tasks/gsm8k_truncated_llamaguard.py | 37 ++++++++++++++++++
 .../tasks/triviaqa_truncated_llamaguard.py  | 39 +++++++++++++++++++
 3 files changed, 86 insertions(+), 6 deletions(-)
 create mode 100644 lm_eval/tasks/gsm8k_truncated_llamaguard.py
 create mode 100644 lm_eval/tasks/triviaqa_truncated_llamaguard.py

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index d18540f03e..5c16eb3855 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -30,6 +30,7 @@
 from . import triviaqa_truncated_13b
 from . import triviaqa_truncated_70b
 from . import triviaqa_truncated_llama
+from . import triviaqa_truncated_llamaguard
 from . import triviaqa_truncated_codellama
 from . import triviaqa_truncated_mistral
 from . import triviaqa_truncated_mixtral
@@ -60,6 +61,7 @@
 from . import gsm8k_truncated_13b
 from . import gsm8k_truncated_70b
 from . import gsm8k_truncated_llama
+from . import gsm8k_truncated_llamaguard
 from . import gsm8k_truncated_codellama
 from . import gsm8k_truncated_mistral
 from . import gsm8k_truncated_mixtral
@@ -148,10 +150,11 @@
     "qa4mre_2012": qa4mre.QA4MRE_2012,
     "qa4mre_2013": qa4mre.QA4MRE_2013,
     "triviaqa": triviaqa.TriviaQA,
-    "triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
-    "triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
-    "triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
+    # "triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
+    # "triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
+    # "triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
     "triviaqa_truncated_llama": triviaqa_truncated_llama.TruncatedTriviaQA,
+    "triviaqa_truncated_llamaguard": triviaqa_truncated_llamaguard.TruncatedTriviaQA,
     "triviaqa_truncated_codellama": triviaqa_truncated_codellama.TruncatedTriviaQA,
     "triviaqa_truncated_mistral": triviaqa_truncated_mistral.TruncatedTriviaQA,
     "triviaqa_truncated_mixtral": triviaqa_truncated_mixtral.TruncatedTriviaQA,
@@ -196,10 +199,11 @@
     "math_precalc": hendrycks_math.MathPrecalculus,
     "math_asdiv": asdiv.Asdiv,
     "gsm8k": gsm8k.GradeSchoolMath8K,
-    "gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
-    "gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
-    "gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_llama": gsm8k_truncated_llama.TruncatedGradeSchoolMath8K,
+    "gsm8k_truncated_llamaguard": gsm8k_truncated_llamaguard.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_codellama": gsm8k_truncated_codellama.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_mistral": gsm8k_truncated_mistral.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_mixtral": gsm8k_truncated_mixtral.TruncatedGradeSchoolMath8K,

diff --git a/lm_eval/tasks/gsm8k_truncated_llamaguard.py b/lm_eval/tasks/gsm8k_truncated_llamaguard.py
new file mode 100644
index 0000000000..3bf327aebf
--- /dev/null
+++ b/lm_eval/tasks/gsm8k_truncated_llamaguard.py
@@ -0,0 +1,37 @@
+import datasets
+from . import gsm8k
+from pathlib import Path
+
+import os
+
+
+class TruncatedGradeSchoolMath8K(gsm8k.GradeSchoolMath8K):
+    # Go up two directory levels, from lm_eval/tasks/ to the repository root
+    parent_dir = Path(__file__).parents[2]
+
+    # Define the dataset path relative to the repository root
+    relative_path = "tests/testdata/gsm8k_truncated_llamaguard.json"
+    DATASET_PATH = os.path.join(parent_dir, relative_path)
+
+    def has_training_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset
+
+    def download(self, data_dir=None, cache_dir=None, download_mode=None):
+        self.dataset = datasets.load_dataset(
+            "json",
+            data_files=self.DATASET_PATH,
+            data_dir=data_dir,
+            cache_dir=cache_dir,
+            split="train",
+        )
+
+        print(self.dataset)
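
The new task reads tests/testdata/gsm8k_truncated_llamaguard.json with the Hugging Face json loader, so the file must hold GSM8K-style records. A sketch of generating a minimal fixture at that path; the field names follow the upstream gsm8k task, and the sample row is invented:

```python
# Sketch: write a minimal GSM8K-style fixture where the new task expects it.
# The "question"/"answer" field names mirror the upstream gsm8k dataset; the
# row content below is illustrative, not part of the real test data.
import json
import os

fixture = [
    {
        "question": "Natalia sold clips to 4 friends, 3 clips each. "
                    "How many clips did she sell in total?",
        "answer": "She sold 4 * 3 = <<4*3=12>>12 clips.\n#### 12",
    }
]

out_path = os.path.join("tests", "testdata", "gsm8k_truncated_llamaguard.json")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w") as f:
    json.dump(fixture, f, indent=2)
```
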
diff --git a/lm_eval/tasks/triviaqa_truncated_llamaguard.py b/lm_eval/tasks/triviaqa_truncated_llamaguard.py
new file mode 100644
index 0000000000..e7346fda84
--- /dev/null
+++ b/lm_eval/tasks/triviaqa_truncated_llamaguard.py
@@ -0,0 +1,39 @@
+import datasets
+from . import triviaqa
+
+import os
+
+
+class TruncatedTriviaQA(triviaqa.TriviaQA):
+    # Get the directory where this module is located
+    script_directory = os.path.dirname(os.path.abspath(__file__))
+
+    # Go up two directory levels, to the repository root
+    parent_dir = os.path.join(script_directory, os.pardir, os.pardir)
+
+    # Define the dataset path relative to the repository root
+    relative_path = "tests/testdata/triviaqa_truncated_llamaguard.json"
+    DATASET_PATH = os.path.join(parent_dir, relative_path)
+
+    def has_training_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset
+
+    def download(self, data_dir=None, cache_dir=None, download_mode=None):
+        self.dataset = datasets.load_dataset(
+            "json",
+            data_files=self.DATASET_PATH,
+            data_dir=data_dir,
+            cache_dir=cache_dir,
+            split="train",
+        )
+
+        print(self.dataset)

From b27d40a6c5b86b2a544ae7ba2fd9fef900efbb46 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Mon, 12 Feb 2024 07:54:09 +0000
Subject: [PATCH 3/3] Add llamaguard to config.py

---
 tests/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/config.py b/tests/config.py
index aff1e2d0c1..0a71b79b5f 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -9,6 +9,7 @@
         "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
+        "llamaguard-7b-fp16": 1,
     },
     "triviaqa": {
         "llama-2-13b-chat-fp16": 2,
@@ -20,5 +21,6 @@
         "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
+        "llamaguard-7b-fp16": 3,
     },
 }
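
Taken together, the series lets the smoke tests target llamaguard on the dev tier. A sketch of a direct (non-docker) invocation, reusing the --model_name/--endpoint options from run_docker_tests.py; the pairing of llamaguard-7b-fp16 with the dev URL is an assumption based on the patch title:

```python
# Sketch: run the renamed smoke tests against a dev llamaguard endpoint without
# the docker wrapper. The pytest options mirror the docker_command built in
# run_docker_tests.py; the model/endpoint pairing here is an assumption.
import subprocess

cmd = [
    "pytest", "tests/smoke_accuracy_tests.py", "-vv",
    "--model_name", "llamaguard-7b-fp16",
    "--endpoint", "https://text.customer-endpoints.nimbus.octoml.ai",
]
subprocess.run(cmd, check=False)  # OCTOAI_TOKEN must be set in the environment
```
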