-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtokencounter.py
27 lines (22 loc) · 896 Bytes
/
tokencounter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Requires the `transformers` package; install it with `pip install -U transformers`.
import sys
from transformers import GPT2Tokenizer
def count_tokens(file_path):
    """Estimate the number of GPT-2 tokens in the text file at *file_path*.

    The file is read as UTF-8 text and encoded with the pretrained "gpt2"
    tokenizer; the length of the resulting token list is the estimate.

    Args:
        file_path: Path of the text file to tokenize.

    Returns:
        The number of tokens produced by the tokenizer, or None when the
        file could not be read or encoded (the error is printed).
    """
    # Interpolate the parameter rather than a module-level global so the
    # function also works when it is imported and called from other code.
    print(f"Counting the token count estimate for: {file_path} ...", flush=True)
    # Loaded outside the try block: a failure to load the tokenizer
    # propagates to the caller instead of being reported as a file error.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        tokens = tokenizer.encode(text)
        return len(tokens)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with
        # None so the caller can decide what to do.
        print(f"Error processing file: {e}")
        return None
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the path of
    # the file whose tokens should be counted.
    if len(sys.argv) == 2:
        inputfile = sys.argv[1]
    else:
        print("Usage: tokencounter.py <inputfile>")
        sys.exit(1)

    token_count = count_tokens(inputfile)
    # count_tokens returns None after printing its own error message, so
    # only a successful count is reported here.
    if not (token_count is None):
        print(f"Estimated token count: {token_count}")