api_judge.py (forked from MatthewCYM/VoiceBench)
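"""LLM-as-judge scoring for VoiceBench responses.

Reads a JSONL file of model outputs (each line containing at least "prompt" and
"response", and optionally "reference"), asks gpt-4o-mini to judge each item,
and writes the results, with an added "score" field, to result-<src_file>.

Open-ended items are rated on a 1-5 scale; items with a reference answer are
judged with a "Yes"/"No" correctness check.

Usage:
    python api_judge.py --src_file <responses>.jsonl
"""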
from tqdm import tqdm
import multiprocessing
import json
from openai import OpenAI
from src.api import generate_text_chat
from argparse import ArgumentParser
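
# Shared OpenAI client used for all judge calls; it reads the API key from the
# OPENAI_API_KEY environment variable.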
client = OpenAI()
meta_prompt_open = """
I need your help to evaluate the performance of several models in the speech interaction scenario. The models will receive a speech input from the user, which they need to understand and respond to with a speech output.
Your task is to rate the model’s responses based on the provided user input transcription [Instruction] and the model’s output transcription [Response].
Please evaluate the response on a scale of 1 to 5:
1 point: The response is largely irrelevant, incorrect, or fails to address the user’s query. It may be off-topic or provide incorrect information.
2 points: The response is somewhat relevant but lacks accuracy or completeness. It may only partially answer the user’s question or include extraneous information.
3 points: The response is relevant and mostly accurate, but it may lack conciseness or include unnecessary details that don’t contribute to the main point.
4 points: The response is relevant, accurate, and concise, providing a clear answer to the user’s question without unnecessary elaboration.
5 points: The response is exceptionally relevant, accurate, and to the point. It directly addresses the user’s query in a highly effective and efficient manner, providing exactly the information needed.
Below are the transcription of user’s instruction and models’ response:
### [Instruction]: {prompt}
### [Response]: {response}
After evaluating, please output the score only without anything else.
You don’t need to provide any explanations.
"""
meta_prompt_qa = """
### Question
{prompt}
### Reference answer
{reference}
### Candidate answer
{response}
Is the candidate answer correct based on the question and reference answer?
Please only output a single "Yes" or "No". Do not output anything else.
""".strip()


def generate(item):
    # Items with a reference answer use the Yes/No QA judge prompt;
    # open-ended items use the 1-5 rating prompt.
    if "reference" in item:
        prompt = meta_prompt_qa.replace('{prompt}', item['prompt']).replace('{reference}', item['reference']).replace('{response}', item['response'])
    else:
        prompt = meta_prompt_open.replace('{prompt}', item['prompt']).replace('{response}', item['response'])
    # Sample three judgments (n=3) and keep all of them.
    rtn = [
        choice.message.content.strip() for choice in generate_text_chat(
            client=client,
            model='gpt-4o-mini',
            messages=[{"role": "system",
                       "content": "You are a helpful assistant who tries to help answer the user's question."},
                      {"role": "user", "content": prompt}],
            max_tokens=1024,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            temperature=0.5, top_p=0.95, n=3
        ).choices
    ]
    item['score'] = rtn
    return item


def main():
    parser = ArgumentParser()
    parser.add_argument('--src_file', required=True)
    args = parser.parse_args()

    # Read source data (one JSON object per line).
    data = []
    with open(args.src_file, 'r') as f:
        for line in f:
            data.append(json.loads(line.strip()))

    # Judge items in parallel with four worker processes.
    with multiprocessing.Pool(4) as pool:
        scores = list(tqdm(pool.imap(generate, data), total=len(data)))

    # Save results alongside the source file, prefixed with 'result-'.
    tgt_file = 'result-' + args.src_file
    with open(tgt_file, 'w') as f:
        for d in scores:
            f.write(json.dumps(d) + '\n')


if __name__ == '__main__':
    main()