"""
Install instructions:
# !pip install pandas
# !pip install py-cpuinfo
# !pip instal llangchain
# !pip install prettytable
# !CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
Guides followed for this script:
- https://github.com/facebookresearch/llama-recipes/blob/main/demo_apps/HelloLlamaLocal.ipynb
- https://llama-cpp-python.readthedocs.io/en/latest/install/macos/
- https://python.langchain.com/docs/integrations/llms/llamacpp
"""
# Standard library imports
import argparse
from pathlib import Path
from timeit import default_timer as timer

# Third-party imports
import pandas as pd
from tqdm.auto import tqdm
from prettytable import PrettyTable  # for nicely formatted results

# LangChain imports
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
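# Note: these import paths match the langchain version this script was written
# against; in langchain>=0.1 the LlamaCpp wrapper moved to the
# langchain_community package (from langchain_community.llms import LlamaCpp).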
try:
    import cpuinfo
    CPU_PROCESSOR = cpuinfo.get_cpu_info().get('brand_raw').replace(" ", "_")
    print(f"[INFO] Processor: {CPU_PROCESSOR}")
except Exception as e:
    CPU_PROCESSOR = "unknown_cpu"  # fallback so the results filename below still works
    print(f"[WARNING] {e}: failed to get the processor name from cpuinfo, please install py-cpuinfo or set CPU_PROCESSOR manually (falling back to '{CPU_PROCESSOR}')")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Run Llama 2 on a set of questions')
parser.add_argument('--path_to_gguf_model', default="./llama-2-7b-chat.Q4_0.gguf", type=str, help='Path to the Llama 2 model, see: https://huggingface.co/TheBloke for downloads, should be ".gguf" format')
parser.add_argument('--num_times_per_question', default=5, type=int, help='Number of times to ask each question')
parser.add_argument('--num_questions', default='all', type=str, help='Number of questions to ask, default "all", can be a positive integer between 1 and 20')
parser.add_argument('--max_tokens', default=500, type=int, help='Max tokens to generate per question, default 500')
parser.add_argument('--stream_output', action='store_true', help='Stream output token by token, may reduce speed')
args = parser.parse_args()
if args.stream_output == True:
print(f"[INFO] Streaming output set to: {args.stream_output}, this will print the output token by token, may reduce speed")
# Prompt questions for the model (using "Let's think step by step..." for verbosity of output)
# See "Let's think step by step..." paper: https://arxiv.org/abs/2205.11916
    questions = [
        "What are the nutrition facts of an apple? Let's think step by step...",
        "What steps are involved in the water cycle? Let's think step by step...",
        "How does a computer process a command? Let's think step by step...",
        "What are the stages of a butterfly's life cycle? Let's think step by step...",
        "How does a refrigerator keep food cold? Let's think step by step...",
        "What happens when we digest food? Let's think step by step...",
        "How does an airplane stay airborne? Let's think step by step...",
        "What are the processes involved in making a cup of coffee? Let's think step by step...",
        "How do bees produce honey? Let's think step by step...",
        "What are the key steps in recycling plastic? Let's think step by step...",
        "How does a clock measure time? Let's think step by step...",
        "What is the process of photosynthesis in plants? Let's think step by step...",
        "How does a car engine work? Let's think step by step...",
        "What are the basic steps in baking bread? Let's think step by step...",
        "How do solar panels generate electricity? Let's think step by step...",
        "What are the stages of human sleep? Let's think step by step...",
        "How does a smartphone connect to the internet? Let's think step by step...",
        "What is the life cycle of a star? Let's think step by step...",
        "How does the human immune system fight viruses? Let's think step by step...",
        "What are the steps involved in creating a movie? Let's think step by step..."
    ]
    def process_questions(arg, questions=questions):
        if arg == "all":
            print(f"[INFO] num_questions arg is 'all', will ask {len(questions)} questions...")
            return questions
        try:
            arg = int(arg)
        except ValueError:
            raise ValueError("Argument must be 'all' or a positive integer between 1 and 20")
        if arg <= 0:
            raise ValueError("Argument must be 'all' or a positive integer between 1 and 20")
        # Make sure arg is not greater than the number of questions
        if arg > len(questions):
            print(f"[INFO] num_questions arg is '{arg}' & is greater than the number of questions '{len(questions)}', returning all questions...")
            return questions
        print(f"[INFO] num_questions arg is '{arg}', will ask {arg} questions...")
        return questions[:arg]

    questions = process_questions(arg=args.num_questions)
    ### Model setup ###
    # Set up your target model here, download from: https://huggingface.co/TheBloke
    # For macOS, you'll generally want "Q4_0.gguf" formatted models
    path_to_gguf_model = args.path_to_gguf_model

    # Make sure the model path exists
    assert Path(path_to_gguf_model).exists(), f"Model path '{path_to_gguf_model}' does not exist, please download a '.gguf' model from https://huggingface.co/TheBloke and save it locally"
    print(f"[INFO] Using model at path: {path_to_gguf_model}")

    # Token-wise streaming: the answer gets printed token by token as Llama generates it
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    # 1 GPU layer is enough for M-series chips
    n_gpu_layers = 1
    llm = LlamaCpp(
        model_path=path_to_gguf_model,
        n_gpu_layers=n_gpu_layers,
        temperature=0.5,
        max_tokens=args.max_tokens,
        n_batch=512,
        top_p=1,
        f16_kv=True,
        n_ctx=2048,  # context window
        callback_manager=callback_manager if args.stream_output else None,
        verbose=False,  # verbose=True prints generation details to the console (may reduce speed)
    )
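    # Optional sanity check (hypothetical snippet, not part of the benchmark):
    # uncomment to confirm the model loads and generates before the full run.
    # print(llm("Q: Name the planets in the solar system. A:"))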
    # Helper function for estimating the token count of a string
    def character_count_to_tokens(sequence: str) -> float:
        character_to_token_ratio = 4  # rough rule of thumb: 4 chars ~= 1 token, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
        # Get the length of the sequence without leading/trailing whitespace
        character_len = len(sequence.strip())
        # Return the estimated token count based on character length
        return character_len / character_to_token_ratio
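    # Worked example of the heuristic (comment only): a 100-character answer maps
    # to 100 / 4 = 25.0 estimated tokens, so the tokens-per-second figures below
    # are estimates rather than true tokenizer counts.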
    ### Ask questions ###
    NUM_TIMES = args.num_times_per_question
    TOTAL_QUESTIONS_TO_ASK = len(questions) * NUM_TIMES
    print(f"[INFO] Asking {len(questions)} questions {NUM_TIMES} times each, total questions: {TOTAL_QUESTIONS_TO_ASK}")
    if TOTAL_QUESTIONS_TO_ASK > 200:
        print(f"[WARNING] Asking {TOTAL_QUESTIONS_TO_ASK} questions may take a while... (consider reducing the number of questions or the number of times to ask each question)")

    # Prompt the model NUM_TIMES per question
    qa_results = []
    for question in tqdm(questions):
        print(f"[INFO] Asking question '{question}' {NUM_TIMES} times.")
        for i in range(NUM_TIMES):
            start_time = timer()
            answer = llm(question)
            end_time = timer()
            total_time = end_time - start_time
            answer_char_len = len(answer.strip())
            chars_per_second = round(answer_char_len / total_time, 2)
            answer_token_len = character_count_to_tokens(sequence=answer)
            tokens_per_second = round(answer_token_len / total_time, 2)
            print(f"Answer char len: {answer_char_len} | Chars per second: {chars_per_second} | Answer token len: {answer_token_len} | Tokens per second: {tokens_per_second}")
            qa_results.append({"question": question,
                               "question_iter": i,
                               "total_time": total_time,
                               "answer": answer,
                               "answer_char_len": answer_char_len,
                               "chars_per_second": chars_per_second,
                               "answer_token_len": answer_token_len,
                               "tokens_per_second": tokens_per_second})
    ### Save results to CSV ###
    GPU_NAME = False  # set manually if benchmarking a discrete GPU instead of the CPU
    MODEL_NAME = path_to_gguf_model.replace('./', '')
    if GPU_NAME:
        csv_filename = f"{GPU_NAME}_{MODEL_NAME}_results.csv"
    else:
        csv_filename = f"{CPU_PROCESSOR}_{MODEL_NAME}_results.csv"

    # Make the target results directory if it doesn't exist (including parents)
    target_results_dir = "results_llama2"
    results_path = Path("results") / target_results_dir
    results_path.mkdir(parents=True, exist_ok=True)
    csv_filepath = results_path / csv_filename

    # Turn the list of result dicts into a DataFrame (pandas is already imported above)
    df = pd.DataFrame(qa_results)
    # Print results
    print(f"[INFO] Results on {CPU_PROCESSOR}:")
    total_questions = len(df)
    total_time_for_all_questions = round(df["total_time"].sum(), 2)
    total_tokens_generated = df["answer_token_len"].sum()
    total_chars_generated = df["answer_char_len"].sum()
    total_tokens_per_second = round(total_tokens_generated / total_time_for_all_questions, 2)
    total_chars_per_second = round(total_chars_generated / total_time_for_all_questions, 2)

    # Create a PrettyTable object
    table = PrettyTable()
    # Define the columns
    table.field_names = ["Metric", "Value"]
    # Add rows
    table.add_row(["Total questions", total_questions])
    table.add_row(["Total time for all questions (s)", total_time_for_all_questions])
    table.add_row(["Total tokens generated", total_tokens_generated])
    table.add_row(["Total chars generated", total_chars_generated])
    table.add_row(["Average tokens per second", total_tokens_per_second])
    table.add_row(["Average chars per second", total_chars_per_second])
    # Print the table
    print(table)
# print(f"Total questions: {total_questions}")
# print(f"Total time for all questions: {total_time_for_all_questions}")
# print(f"Total tokens generated: {total_tokens_generated}")
# print(f"Total chars generated: {total_chars_generated}")
# print(f"Average tokens per second: {total_tokens_per_second}")
# print(f"Average chars per second: {total_chars_per_second}")
    # Save to CSV
    print(f"[INFO] Saving results to: {csv_filepath}")
    df.to_csv(csv_filepath, index=False)
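    # To inspect saved results later (usage sketch, assuming the same filepath):
    # import pandas as pd
    # df = pd.read_csv(csv_filepath)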