From 65efc80fdf0e88353bcf730715b6d5980338a4a7 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Fri, 26 Jan 2024 10:46:48 +0000
Subject: [PATCH 1/3] Rename endpoints accuracy tests and modify datasets

---
 tests/config.py                                | 18 ++--
 tests/endpoints.json                           | 97 +++++++++++++++++++
 tests/run_docker_tests.py                      | 57 +----------
 ...st_endpoint.py => smoke_accuracy_tests.py}  | 12 ++--
 4 files changed, 117 insertions(+), 67 deletions(-)
 create mode 100644 tests/endpoints.json
 rename tests/{unittest_endpoint.py => smoke_accuracy_tests.py} (96%)

diff --git a/tests/config.py b/tests/config.py
index 84baf406c5..aff1e2d0c1 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -2,22 +2,22 @@
     "gsm8k": {
         "llama-2-13b-chat-fp16": 2,
         "llama-2-70b-chat-int4": 3,
-        "llama-2-70b-chat-fp16": 4,
+        "llama-2-70b-chat-fp16": 3,
         "codellama-7b-instruct-fp16": 1,
         "codellama-13b-instruct-fp16": 2,
         "codellama-34b-instruct-int4": 3,
-        "codellama-34b-instruct-fp16": 4,
+        "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
     },
     "triviaqa": {
-        "llama-2-13b-chat-fp16": 3,
-        "llama-2-70b-chat-int4": 4,
-        "llama-2-70b-chat-fp16": 5,
-        "codellama-7b-instruct-fp16": 2,
-        "codellama-13b-instruct-fp16": 3,
-        "codellama-34b-instruct-int4": 4,
-        "codellama-34b-instruct-fp16": 5,
+        "llama-2-13b-chat-fp16": 2,
+        "llama-2-70b-chat-int4": 3,
+        "llama-2-70b-chat-fp16": 4,
+        "codellama-7b-instruct-fp16": 1,
+        "codellama-13b-instruct-fp16": 2,
+        "codellama-34b-instruct-int4": 3,
+        "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
     },
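
The mapping above is keyed task -> model -> num_fewshot. For reference, a minimal sketch of a lookup against it, assuming config.py binds the dict to a name such as FEWSHOT_CONFIG (the actual variable name is outside the hunk context):

```python
# Minimal sketch: resolving a few-shot count from the task -> model -> shots
# mapping kept in tests/config.py. FEWSHOT_CONFIG is an assumed name; only the
# dict contents are taken from the patch above.
FEWSHOT_CONFIG = {
    "gsm8k": {"llama-2-13b-chat-fp16": 2, "codellama-7b-instruct-fp16": 1},
    "triviaqa": {"llama-2-13b-chat-fp16": 2, "codellama-7b-instruct-fp16": 1},
}

def num_fewshot_for(task: str, model: str, default: int = 0) -> int:
    # Fall back to 0-shot when a (task, model) pair is not configured.
    return FEWSHOT_CONFIG.get(task, {}).get(model, default)

assert num_fewshot_for("gsm8k", "codellama-7b-instruct-fp16") == 1
```
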
"https://text.octoai.run", + "model": "llama-2-70b-chat-int4", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "llama-2-70b-chat-fp16", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "mistral-7b-instruct-fp16", + "context_size": 4096 + }, + { + "url": "https://text.octoai.run", + "model": "mixtral-8x7b-instruct-fp16", + "context_size": 4096 + } + ] +} \ No newline at end of file diff --git a/tests/run_docker_tests.py b/tests/run_docker_tests.py index 64d8c46b84..e76ab8bb6e 100644 --- a/tests/run_docker_tests.py +++ b/tests/run_docker_tests.py @@ -2,61 +2,14 @@ import subprocess import os -endpoints_data = { - "dev": [ - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-7b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-13b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-34b-instruct-int4", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "codellama-34b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-13b-chat-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-70b-chat-int4", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "llama-2-70b-chat-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "mistral-7b-instruct-fp16", - }, - { - "url": "https://text.customer-endpoints.nimbus.octoml.ai", - "model": "mixstral-8x7b-instruct-fp16", - }, - ], - "prod": [ - {"url": "https://text.octoai.run", "model": "codellama-7b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "codellama-13b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-int4"}, - {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-fp16"}, - {"url": "https://text.octoai.run", "model": "llama-2-13b-chat-fp16"}, - {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-int4"}, - {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-fp16"}, - {"url": "https://text.octoai.run", "model": "mistral-7b-instruct-fp16"}, - ], -} +with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'endpoints.json'), 'r') as f: + # Загружаем данные из файла + endpoints_data = json.load(f) token = os.getenv("OCTOAI_TOKEN") current_directory = os.getcwd() path = "test_results" -prod = "prod" +prod = "Prod" if not os.path.exists(os.path.join(current_directory, path)): os.makedirs(path) @@ -67,6 +20,6 @@ model_name = model_info["model"] model_url = model_info["url"] - docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/unittest_endpoint.py -vv --model_name {model_name} --endpoint {model_url}" + docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/smoke_accuracy_tests.py -vv --model_name {model_name} --endpoint {model_url}" subprocess.run(docker_command, shell=True) diff --git a/tests/unittest_endpoint.py b/tests/smoke_accuracy_tests.py similarity index 96% rename from tests/unittest_endpoint.py rename to tests/smoke_accuracy_tests.py index 36f1addb3f..9ccf120f51 100644 --- a/tests/unittest_endpoint.py +++ b/tests/smoke_accuracy_tests.py @@ -35,12 +35,12 @@ def 
diff --git a/tests/unittest_endpoint.py b/tests/smoke_accuracy_tests.py
similarity index 96%
rename from tests/unittest_endpoint.py
rename to tests/smoke_accuracy_tests.py
index 36f1addb3f..9ccf120f51 100644
--- a/tests/unittest_endpoint.py
+++ b/tests/smoke_accuracy_tests.py
@@ -35,12 +35,12 @@ def check_output(model_name, task_name, num_fewshot):
             ), f"Found the wrong answer or the incorrect scoring case:\nPredicted:\n{i['logit_0']}\nTruth:\n{i['truth']}"
 
         result = (f"test_endpoint_{task_name}", model_name, "PASSED")
+        write_results_to_csv(result)
     except AssertionError as e:
         result = (f"test_endpoint_{task_name}", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
         raise e
 
-    write_results_to_csv(result)
-
 
 @pytest.fixture
 def model_name(request):
@@ -64,15 +64,15 @@ def test_endpoint_availability(model_name, endpoint, token):
     ]
 
     try:
-        assert run_chat_completion(model_name, messages, token, endpoint) == 200
-        result = ("test_endpoint_availability", model_name, "PASSED")
+        assert run_chat_completion(model_name, messages, token, endpoint) == 200
+        result = ("test_endpoint_availability", model_name, "PASSED")
+        write_results_to_csv(result)
     except AssertionError as e:
         result = ("test_endpoint_availability", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
         raise e
 
-    write_results_to_csv(result)
-
 
 def test_endpoint_gsm8k(model_name, endpoint, token):
     num_fewshot = 0
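
The tests in this file report through a write_results_to_csv helper whose definition is outside the hunk context. A hypothetical sketch of such a helper, assuming one shared CSV under test_results/ (the path and column layout are assumptions; only the (test_name, model_name, status) tuple shape comes from the calls above):

```python
# Hypothetical helper matching the write_results_to_csv(result) calls above.
# RESULTS_CSV is an assumed location; only the 3-tuple shape
# (test_name, model_name, status) is taken from the patch.
import csv
import os

RESULTS_CSV = os.path.join("test_results", "results.csv")

def write_results_to_csv(result):
    # Append one result row, creating the output directory on first use.
    os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)
    with open(RESULTS_CSV, "a", newline="") as f:
        # e.g. ("test_endpoint_gsm8k", "llama-2-13b-chat-fp16", "PASSED")
        csv.writer(f).writerow(result)
```
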
From 01253cf54019798f33148d683fd18babba4bc977 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Mon, 12 Feb 2024 07:50:39 +0000
Subject: [PATCH 2/3] Support (dev) llamaguard in smoke accuracy tests

---
 lm_eval/tasks/__init__.py                   | 16 +++++---
 lm_eval/tasks/gsm8k_truncated_llamaguard.py | 37 ++++++++++++++++++
 .../tasks/triviaqa_truncated_llamaguard.py  | 39 +++++++++++++++++++
 3 files changed, 86 insertions(+), 6 deletions(-)
 create mode 100644 lm_eval/tasks/gsm8k_truncated_llamaguard.py
 create mode 100644 lm_eval/tasks/triviaqa_truncated_llamaguard.py

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index d18540f03e..5c16eb3855 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -30,6 +30,7 @@
 from . import triviaqa_truncated_13b
 from . import triviaqa_truncated_70b
 from . import triviaqa_truncated_llama
+from . import triviaqa_truncated_llamaguard
 from . import triviaqa_truncated_codellama
 from . import triviaqa_truncated_mistral
 from . import triviaqa_truncated_mixtral
@@ -60,6 +61,7 @@
 from . import gsm8k_truncated_13b
 from . import gsm8k_truncated_70b
 from . import gsm8k_truncated_llama
+from . import gsm8k_truncated_llamaguard
 from . import gsm8k_truncated_codellama
 from . import gsm8k_truncated_mistral
 from . import gsm8k_truncated_mixtral
@@ -148,10 +150,11 @@
     "qa4mre_2012": qa4mre.QA4MRE_2012,
     "qa4mre_2013": qa4mre.QA4MRE_2013,
     "triviaqa": triviaqa.TriviaQA,
-    "triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
-    "triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
-    "triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
+    # "triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
+    # "triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
+    # "triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
     "triviaqa_truncated_llama": triviaqa_truncated_llama.TruncatedTriviaQA,
+    "triviaqa_truncated_llamaguard": triviaqa_truncated_llamaguard.TruncatedTriviaQA,
     "triviaqa_truncated_codellama": triviaqa_truncated_codellama.TruncatedTriviaQA,
     "triviaqa_truncated_mistral": triviaqa_truncated_mistral.TruncatedTriviaQA,
     "triviaqa_truncated_mixtral": triviaqa_truncated_mixtral.TruncatedTriviaQA,
@@ -196,10 +199,11 @@
     "math_precalc": hendrycks_math.MathPrecalculus,
     "math_asdiv": asdiv.Asdiv,
     "gsm8k": gsm8k.GradeSchoolMath8K,
-    "gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
-    "gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
-    "gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
+    # "gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_llama": gsm8k_truncated_llama.TruncatedGradeSchoolMath8K,
+    "gsm8k_truncated_llamaguard": gsm8k_truncated_llamaguard.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_codellama": gsm8k_truncated_codellama.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_mistral": gsm8k_truncated_mistral.TruncatedGradeSchoolMath8K,
     "gsm8k_truncated_mixtral": gsm8k_truncated_mixtral.TruncatedGradeSchoolMath8K,

diff --git a/lm_eval/tasks/gsm8k_truncated_llamaguard.py b/lm_eval/tasks/gsm8k_truncated_llamaguard.py
new file mode 100644
index 0000000000..3bf327aebf
--- /dev/null
+++ b/lm_eval/tasks/gsm8k_truncated_llamaguard.py
@@ -0,0 +1,37 @@
+import datasets
+from . import gsm8k
+from pathlib import Path
+
+import os
+
+
+class TruncatedGradeSchoolMath8K(gsm8k.GradeSchoolMath8K):
+    # Go up two directory levels, from lm_eval/tasks/ to the repository root
+    parent_dir = Path(__file__).parents[2]
+
+    # Define the dataset path relative to the repository root
+    relative_path = "tests/testdata/gsm8k_truncated_llamaguard.json"
+    DATASET_PATH = os.path.join(parent_dir, relative_path)
+
+    def has_training_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset
+
+    def download(self, data_dir=None, cache_dir=None, download_mode=None):
+        self.dataset = datasets.load_dataset(
+            "json",
+            data_files=self.DATASET_PATH,
+            data_dir=data_dir,
+            cache_dir=cache_dir,
+            split="train",
+        )
+
+        print(self.dataset)
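
The new task reads tests/testdata/gsm8k_truncated_llamaguard.json with the Hugging Face json loader, so the file must hold GSM8K-style records. A sketch of generating a minimal fixture at that path; the field names follow the upstream gsm8k task, and the sample row is invented:

```python
# Sketch: write a minimal GSM8K-style fixture where the new task expects it.
# The "question"/"answer" field names mirror the upstream gsm8k dataset; the
# row content below is illustrative, not part of the real test data.
import json
import os

fixture = [
    {
        "question": "Natalia sold clips to 4 friends, 3 clips each. "
                    "How many clips did she sell in total?",
        "answer": "She sold 4 * 3 = <<4*3=12>>12 clips.\n#### 12",
    }
]

out_path = os.path.join("tests", "testdata", "gsm8k_truncated_llamaguard.json")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w") as f:
    json.dump(fixture, f, indent=2)
```
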
diff --git a/lm_eval/tasks/triviaqa_truncated_llamaguard.py b/lm_eval/tasks/triviaqa_truncated_llamaguard.py
new file mode 100644
index 0000000000..e7346fda84
--- /dev/null
+++ b/lm_eval/tasks/triviaqa_truncated_llamaguard.py
@@ -0,0 +1,39 @@
+import datasets
+from . import triviaqa
+
+import os
+
+
+class TruncatedTriviaQA(triviaqa.TriviaQA):
+    # Get the directory where this module is located
+    script_directory = os.path.dirname(os.path.abspath(__file__))
+
+    # Go up two directory levels, to the repository root
+    parent_dir = os.path.join(script_directory, os.pardir, os.pardir)
+
+    # Define the dataset path relative to the repository root
+    relative_path = "tests/testdata/triviaqa_truncated_llamaguard.json"
+    DATASET_PATH = os.path.join(parent_dir, relative_path)
+
+    def has_training_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset
+
+    def download(self, data_dir=None, cache_dir=None, download_mode=None):
+        self.dataset = datasets.load_dataset(
+            "json",
+            data_files=self.DATASET_PATH,
+            data_dir=data_dir,
+            cache_dir=cache_dir,
+            split="train",
+        )
+
+        print(self.dataset)

From b27d40a6c5b86b2a544ae7ba2fd9fef900efbb46 Mon Sep 17 00:00:00 2001
From: dsbarinov1
Date: Mon, 12 Feb 2024 07:54:09 +0000
Subject: [PATCH 3/3] Add llamaguard to config.py

---
 tests/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/config.py b/tests/config.py
index aff1e2d0c1..0a71b79b5f 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -9,6 +9,7 @@
         "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
+        "llamaguard-7b-fp16": 1,
     },
     "triviaqa": {
         "llama-2-13b-chat-fp16": 2,
@@ -20,5 +21,6 @@
         "codellama-34b-instruct-fp16": 3,
         "mistral-7b-instruct-fp16": 3,
         "mixtral-8x7b-instruct-fp16": 3,
+        "llamaguard-7b-fp16": 3,
     },
 }
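
Taken together, the series lets the smoke tests target llamaguard on the dev tier. A sketch of a direct (non-docker) invocation, reusing the --model_name/--endpoint options from run_docker_tests.py; the pairing of llamaguard-7b-fp16 with the dev URL is an assumption based on the patch title:

```python
# Sketch: run the renamed smoke tests against a dev llamaguard endpoint without
# the docker wrapper. The pytest options mirror the docker_command built in
# run_docker_tests.py; the model/endpoint pairing here is an assumption.
import subprocess

cmd = [
    "pytest", "tests/smoke_accuracy_tests.py", "-vv",
    "--model_name", "llamaguard-7b-fp16",
    "--endpoint", "https://text.customer-endpoints.nimbus.octoml.ai",
]
subprocess.run(cmd, check=False)  # OCTOAI_TOKEN must be set in the environment
```
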