test.py
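"""Evaluate stored LLM regression responses and report MSE, MAE, and R^2 for every
combination of dataset, model, in-context example count, feature count, and prompt
configuration."""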
import argparse
import os

import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from reader import read_dataset
from utils import create_file_name, extract_answer

def parse_arguments():
    parser = argparse.ArgumentParser(description="LLM Dataset Evaluation Test Script for OpenAI Models.")
    parser.add_argument("--datasets", nargs="+", default=["Insurance_Cost", "Admission_Chance", "Used_Car_Prices"], help="List of datasets to evaluate.")
    parser.add_argument("--models", nargs="+", default=["gpt-4-0125-preview", "gpt-3.5-turbo-0125", "meta/meta-llama-3-70b-instruct"], help="List of LLM models to use.")
    parser.add_argument("--in-context-numbers", nargs="+", type=int, default=[0, 10, 30, 100], help="List of in-context example counts to use.")
    parser.add_argument("--feature-nums", nargs="+", type=int, default=[1, 2, 3, 4], help="List of feature counts to use.")
    parser.add_argument("--configs", nargs="+", default=["Named_Features", "Anonymized_Features", "Randomized_Ground_Truth", "Reasoning"], choices=["Named_Features", "Anonymized_Features", "Randomized_Ground_Truth", "Reasoning", "Missing_Inputs", "Missing_Inputs_and_Anonymized_Features"], help="List of prompt configurations to use.")
    parser.add_argument("--input-folder", type=str, default="LLM_Results", help="Name of the folder from which to read the LLM results.")
    parser.add_argument("--output-folder", type=str, default="./", help="Name of the folder in which to save the outputs.")
    parser.add_argument("--testing-sampling", type=int, default=0, help="Sampling index assigned to the outputs.")
    return parser.parse_args()
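
# Example invocation (illustrative; flag values other than the defaults above are
# hypothetical, not taken from the original experiments):
#   python test.py --datasets Insurance_Cost --models gpt-4-0125-preview \
#       --in-context-numbers 0 10 --feature-nums 1 2 --configs Named_Features Reasoning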

def evaluate_responses(dataset, model_name, in_context, feature_num, config, args):
    # Locate the stored LLM responses for this experimental configuration.
    file_name = create_file_name(args.input_folder, dataset, model_name, in_context, feature_num, config, testing_sampling=args.testing_sampling)
    _, _, _, _, y_test = read_dataset(dataset, config)
    df_test = pd.read_csv(file_name)
    real_responses = []
    predicted_responses = []
    for index, row in df_test.iterrows():
        predicted_response = extract_answer(str(row['processed_response']))
        # Skip rows where no numeric answer could be extracted from the response.
        if predicted_response is not None and predicted_response != -1:
            real_responses.append(y_test[index])
            # Admission chance is a probability, so clip out-of-range predictions to [0, 1].
            if dataset == "Admission_Chance" and (predicted_response > 1.0 or predicted_response < 0):
                predicted_response = max(0, min(1, predicted_response))
            predicted_responses.append(predicted_response)
    mse = mean_squared_error(real_responses, predicted_responses)
    mae = mean_absolute_error(real_responses, predicted_responses)
    r2 = r2_score(real_responses, predicted_responses)
    shorten_llm_names = {"gpt-4-0125-preview": 'GPT-4', "gpt-3.5-turbo-0125": 'GPT-3', "meta/meta-llama-3-70b-instruct": 'LLaMA 3'}.get
    return {
        "features": feature_num,
        "dataset": dataset,
        "model": shorten_llm_names(model_name),
        "in_context": in_context,
        # Zero-shot prompting with named features is reported as "Direct QA".
        "config": "Direct QA" if in_context == 0 and config == "Named_Features" else config,
        "MSE": float(mse),
        "MAE": float(mae),
        "r2": float(r2),
    }
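
# Assumed behaviour of the imported helpers (their implementations live in utils.py
# and reader.py and are not shown here, so treat these signatures as a sketch):
#   create_file_name(...) -> path of the CSV holding the stored LLM responses
#   read_dataset(...)     -> dataset splits; only the final element, y_test, is used here
#   extract_answer(text)  -> numeric prediction parsed from a response, or None / -1
#                            when no answer could be extracted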

def main():
    args = parse_arguments()
    results = []
    for dataset in args.datasets:
        for model_name in args.models:
            for in_context in args.in_context_numbers:
                for feature_num in args.feature_nums:
                    for config in args.configs:
                        # Skip combinations outside the experimental design: zero-shot runs are
                        # limited to Named_Features / Reasoning prompts, Reasoning is zero-shot
                        # only, Admission_Chance is not run with more than ~100 in-context
                        # examples, and the 4-feature setting applies only to Used_Car_Prices
                        # and excludes LLaMA 3.
                        if (in_context == 0 and not (("Named_Features" in config) or ("Reasoning" in config))) or \
                           (config == "Reasoning" and in_context > 0) or \
                           (dataset == "Admission_Chance" and in_context > 101) or \
                           (feature_num == 4 and dataset != "Used_Car_Prices") or \
                           (feature_num == 4 and model_name == "meta/meta-llama-3-70b-instruct"):
                            continue
                        result = evaluate_responses(dataset, model_name, in_context, feature_num, config, args)
                        results.append(result)
                        print(f"Processed: {result}")
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(args.output_folder, "evaluation_results.csv"), index=False)
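
# Each processed combination contributes one row to <output-folder>/evaluation_results.csv
# with the columns: features, dataset, model, in_context, config, MSE, MAE, r2.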

if __name__ == "__main__":
    main()