Rename endpoints accuracy tests and modify datasets #34

Open: wants to merge 3 commits into base: develop-hf
16 changes: 10 additions & 6 deletions lm_eval/tasks/__init__.py
@@ -30,6 +30,7 @@
from . import triviaqa_truncated_13b
from . import triviaqa_truncated_70b
from . import triviaqa_truncated_llama
+from . import triviaqa_truncated_llamaguard
from . import triviaqa_truncated_codellama
from . import triviaqa_truncated_mistral
from . import triviaqa_truncated_mixtral
@@ -60,6 +61,7 @@
from . import gsm8k_truncated_13b
from . import gsm8k_truncated_70b
from . import gsm8k_truncated_llama
+from . import gsm8k_truncated_llamaguard
from . import gsm8k_truncated_codellama
from . import gsm8k_truncated_mistral
from . import gsm8k_truncated_mixtral
@@ -148,10 +150,11 @@
"qa4mre_2012": qa4mre.QA4MRE_2012,
"qa4mre_2013": qa4mre.QA4MRE_2013,
"triviaqa": triviaqa.TriviaQA,
"triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
"triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
"triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
# "triviaqa_truncated_7b": triviaqa_truncated_7b.TruncatedTriviaQA,
# "triviaqa_truncated_13b": triviaqa_truncated_13b.TruncatedTriviaQA,
# "triviaqa_truncated_70b": triviaqa_truncated_70b.TruncatedTriviaQA,
"triviaqa_truncated_llama": triviaqa_truncated_llama.TruncatedTriviaQA,
"triviaqa_truncated_llamaguard": triviaqa_truncated_llamaguard.TruncatedTriviaQA,
"triviaqa_truncated_codellama": triviaqa_truncated_codellama.TruncatedTriviaQA,
"triviaqa_truncated_mistral": triviaqa_truncated_mistral.TruncatedTriviaQA,
"triviaqa_truncated_mixtral": triviaqa_truncated_mixtral.TruncatedTriviaQA,
@@ -196,10 +199,11 @@
"math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
"gsm8k": gsm8k.GradeSchoolMath8K,
"gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
# "gsm8k_truncated_7b": gsm8k_truncated_7b.TruncatedGradeSchoolMath8K,
# "gsm8k_truncated_13b": gsm8k_truncated_13b.TruncatedGradeSchoolMath8K,
# "gsm8k_truncated_70b": gsm8k_truncated_70b.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_llama": gsm8k_truncated_llama.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_llamaguard": gsm8k_truncated_llamaguard.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_codellama": gsm8k_truncated_codellama.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_mistral": gsm8k_truncated_mistral.TruncatedGradeSchoolMath8K,
"gsm8k_truncated_mixtral": gsm8k_truncated_mixtral.TruncatedGradeSchoolMath8K,
37 changes: 37 additions & 0 deletions lm_eval/tasks/gsm8k_truncated_llamaguard.py
@@ -0,0 +1,37 @@
import datasets
from . import gsm8k
from pathlib import Path

import os


class TruncatedGradeSchoolMath8K(gsm8k.GradeSchoolMath8K):
    # Go up two directory levels (to the repository root)
    parent_dir = Path(__file__).parents[2]

    # Define the path relative to the script location
    relative_path = "tests/testdata/gsm8k_truncated_llamaguard.json"
    DATASET_PATH = os.path.join(parent_dir, relative_path)

    def has_training_docs(self):
        return False

    def has_test_docs(self):
        return True

    def training_docs(self):
        raise NotImplementedError

    def test_docs(self):
        return self.dataset

    def download(self, data_dir=None, cache_dir=None, download_mode=None):
        # Load the truncated subset from the local JSON fixture
        self.dataset = datasets.load_dataset(
            "json",
            data_files=self.DATASET_PATH,
            data_dir=data_dir,
            cache_dir=cache_dir,
            split="train",
        )

        print(self.dataset)
39 changes: 39 additions & 0 deletions lm_eval/tasks/triviaqa_truncated_llamaguard.py
@@ -0,0 +1,39 @@
import datasets
from . import triviaqa

import os


class TruncatedTriviaQA(triviaqa.TriviaQA):
    # Get the directory where the script is located
    script_directory = os.path.dirname(os.path.abspath(__file__))

    # Go up two directory levels (to the repository root)
    parent_dir = os.path.join(script_directory, os.pardir, os.pardir)

    # Define the path relative to the script location
    relative_path = "tests/testdata/triviaqa_truncated_llamaguard.json"
    DATASET_PATH = os.path.join(parent_dir, relative_path)

    def has_training_docs(self):
        return False

    def has_test_docs(self):
        return True

    def training_docs(self):
        raise NotImplementedError

    def test_docs(self):
        return self.dataset

    def download(self, data_dir=None, cache_dir=None, download_mode=None):
        # Load the truncated subset from the local JSON fixture
        self.dataset = datasets.load_dataset(
            "json",
            data_files=self.DATASET_PATH,
            data_dir=data_dir,
            cache_dir=cache_dir,
            split="train",
        )

        print(self.dataset)
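Both new task classes follow the same pattern: resolve the truncated JSON fixture under tests/testdata/ relative to the repository root, then load it with the Hugging Face datasets JSON loader and expose it as the test split. A reviewer can sanity-check the wiring with a sketch like the one below; it is illustrative rather than part of this PR, and it assumes the harness's usual flow in which get_task() looks up TASK_REGISTRY and Task.__init__ calls download().

# Hypothetical local sanity check, not part of this PR.
from lm_eval import tasks

# Instantiating the registered class triggers download(), which reads
# tests/testdata/gsm8k_truncated_llamaguard.json from the repo root.
task = tasks.get_task("gsm8k_truncated_llamaguard")()

assert not task.has_training_docs()
assert task.has_test_docs()
print(next(iter(task.test_docs())))  # first truncated GSM8K record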
20 changes: 11 additions & 9 deletions tests/config.py
@@ -2,23 +2,25 @@
"gsm8k": {
"llama-2-13b-chat-fp16": 2,
"llama-2-70b-chat-int4": 3,
"llama-2-70b-chat-fp16": 4,
"llama-2-70b-chat-fp16": 3,
"codellama-7b-instruct-fp16": 1,
"codellama-13b-instruct-fp16": 2,
"codellama-34b-instruct-int4": 3,
"codellama-34b-instruct-fp16": 4,
"codellama-34b-instruct-fp16": 3,
"mistral-7b-instruct-fp16": 3,
"mixtral-8x7b-instruct-fp16": 3,
"llamaguard-7b-fp16": 1,
},
"triviaqa": {
"llama-2-13b-chat-fp16": 3,
"llama-2-70b-chat-int4": 4,
"llama-2-70b-chat-fp16": 5,
"codellama-7b-instruct-fp16": 2,
"codellama-13b-instruct-fp16": 3,
"codellama-34b-instruct-int4": 4,
"codellama-34b-instruct-fp16": 5,
"llama-2-13b-chat-fp16": 2,
"llama-2-70b-chat-int4": 3,
"llama-2-70b-chat-fp16": 4,
"codellama-7b-instruct-fp16": 1,
"codellama-13b-instruct-fp16": 2,
"codellama-34b-instruct-int4": 3,
"codellama-34b-instruct-fp16": 3,
"mistral-7b-instruct-fp16": 3,
"mixtral-8x7b-instruct-fp16": 3,
"llamaguard-7b-fp16": 3,
},
}
97 changes: 97 additions & 0 deletions tests/endpoints.json
@@ -0,0 +1,97 @@
{
"Dev": [
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "codellama-7b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "codellama-13b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "codellama-34b-instruct-int4",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "codellama-34b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "llama-2-13b-chat-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "llama-2-70b-chat-int4",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "llama-2-70b-chat-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "mistral-7b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.customer-endpoints.nimbus.octoml.ai",
"model": "mixtral-8x7b-instruct-fp16",
"context_size": 4096
}
],

"Prod": [
{
"url": "https://text.octoai.run",
"model": "codellama-7b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "codellama-13b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "codellama-34b-instruct-int4",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "codellama-34b-instruct-fp16",
"context_size": 16384
},
{
"url": "https://text.octoai.run",
"model": "llama-2-13b-chat-fp16",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "llama-2-70b-chat-int4",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "llama-2-70b-chat-fp16",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "mistral-7b-instruct-fp16",
"context_size": 4096
},
{
"url": "https://text.octoai.run",
"model": "mixtral-8x7b-instruct-fp16",
"context_size": 4096
}
]
}
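With the endpoint inventory moved out of Python and into tests/endpoints.json, a small guard test can keep the file well-formed as models come and go. The sketch below is illustrative and not part of this PR; it assumes the file keeps its two top-level groups ("Dev" and "Prod") and the three fields shown above.

import json
import os


def test_endpoints_json_is_well_formed():
    # Hypothetical schema guard, not part of this PR.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "endpoints.json")
    with open(path) as f:
        data = json.load(f)
    assert set(data) == {"Dev", "Prod"}
    for group in data.values():
        for entry in group:
            assert entry["url"].startswith("https://")
            assert entry["model"]
            assert entry["context_size"] > 0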
57 changes: 5 additions & 52 deletions tests/run_docker_tests.py
@@ -2,61 +2,14 @@
import subprocess
import os

-endpoints_data = {
-    "dev": [
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "codellama-7b-instruct-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "codellama-13b-instruct-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "codellama-34b-instruct-int4",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "codellama-34b-instruct-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "llama-2-13b-chat-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "llama-2-70b-chat-int4",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "llama-2-70b-chat-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "mistral-7b-instruct-fp16",
-        },
-        {
-            "url": "https://text.customer-endpoints.nimbus.octoml.ai",
-            "model": "mixstral-8x7b-instruct-fp16",
-        },
-    ],
-    "prod": [
-        {"url": "https://text.octoai.run", "model": "codellama-7b-instruct-fp16"},
-        {"url": "https://text.octoai.run", "model": "codellama-13b-instruct-fp16"},
-        {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-int4"},
-        {"url": "https://text.octoai.run", "model": "codellama-34b-instruct-fp16"},
-        {"url": "https://text.octoai.run", "model": "llama-2-13b-chat-fp16"},
-        {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-int4"},
-        {"url": "https://text.octoai.run", "model": "llama-2-70b-chat-fp16"},
-        {"url": "https://text.octoai.run", "model": "mistral-7b-instruct-fp16"},
-    ],
-}
+with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'endpoints.json'), 'r') as f:
+    # Load the endpoint data from the file
+    endpoints_data = json.load(f)

token = os.getenv("OCTOAI_TOKEN")
current_directory = os.getcwd()
path = "test_results"
prod = "prod"
prod = "Prod"

if not os.path.exists(os.path.join(current_directory, path)):
os.makedirs(path)
@@ -67,6 +20,6 @@
model_name = model_info["model"]
model_url = model_info["url"]

docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/unittest_endpoint.py -vv --model_name {model_name} --endpoint {model_url}"
docker_command = f"docker run -v {current_directory}/{path}:/lm_eval/test_results -e OCTOAI_TOKEN={token} daniilbarinov/lm-eval:1.0 pytest tests/smoke_accuracy_tests.py -vv --model_name {model_name} --endpoint {model_url}"

subprocess.run(docker_command, shell=True)
11 changes: 5 additions & 6 deletions tests/unittest_endpoint.py → tests/smoke_accuracy_tests.py
@@ -35,12 +35,12 @@ def check_output(model_name, task_name, num_fewshot):
), f"Found the wrong answer or the incorrect scoring case:\nPredicted:\n{i['logit_0']}\nTruth:\n{i['truth']}"

result = (f"test_endpoint_{task_name}", model_name, "PASSED")
+        write_results_to_csv(result)
except AssertionError as e:
result = (f"test_endpoint_{task_name}", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
raise e

-    write_results_to_csv(result)
-

@pytest.fixture
def model_name(request):
@@ -64,15 +64,14 @@ def test_endpoint_availability(model_name, endpoint, token):
]

try:
-        assert run_chat_completion(model_name, messages, token, endpoint) == 200
-
result = ("test_endpoint_availability", model_name, "PASSED")
+        assert run_chat_completion(model_name, messages, token, endpoint) == 200
+        write_results_to_csv(result)
except AssertionError as e:
result = ("test_endpoint_availability", model_name, f"FAILED: {str(e)}")
+        write_results_to_csv(result)
raise e

-    write_results_to_csv(result)
-

def test_endpoint_gsm8k(model_name, endpoint, token):
num_fewshot = 0