From 5d878d776278097c45f8328a25bb993058e03ceb Mon Sep 17 00:00:00 2001
From: shrkvr2024 <146485007+shrkvr2024@users.noreply.github.com>
Date: Thu, 3 Oct 2024 14:25:13 +0330
Subject: [PATCH] Create tokenizer.py

---
 train/code/tokenizer.py | 87 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 train/code/tokenizer.py

diff --git a/train/code/tokenizer.py b/train/code/tokenizer.py
new file mode 100644
index 0000000..36763f9
--- /dev/null
+++ b/train/code/tokenizer.py
@@ -0,0 +1,87 @@
+import regex as re
+
+#raw string (sample text: a list of Persian surnames)
+mystring="هوشمند بیرانوند فروغی مظاهری زینلی معینی دولت رضاییان قادمه حیدرزاده کوچک سیاح حکیمی غلامزاده"
+#g=[ord(x) for x in mystring]
+
+
+#encode the raw text into a list of UTF-8 byte values
+tokens=list(mystring.encode("utf-8"))
+
+#gets the statistics of which pairs appear together most frequently
+def get_stats(ids):
+    counts={}
+    for pair in zip(ids,ids[1:]): # zip(ids,ids[1:]) makes a sliding window to compare 2 consecutive elements
+        counts[pair]=counts.get(pair,0)+1
+    return counts
+stats=get_stats(tokens)
+#print(stats)
+#print(sorted(((v,k) for k,v in stats.items()),reverse=True))
+top_pair=max(stats,key=stats.get)
+#print(top_pair)
+
+#replaces every occurrence of the most common pair with a new token id (idx)
+def merge(ids,pair,idx):
+    newids=[]
+    i=0
+    while i<len(ids):
+        if i<len(ids)-1 and ids[i]==pair[0] and ids[i+1]==pair[1]:
+            newids.append(idx)
+            i+=2
+        else:
+            newids.append(ids[i])
+            i+=1
+    return newids
+
+vocab_size=276 # desired final vocabulary size
+num_merges=vocab_size-256
+ids=list(tokens) # copy so the original byte list stays intact
+
+merges={} # (int,int) -> int, a pair (child1,child2) turning into a new token
+for i in range(num_merges):
+    stats=get_stats(ids)
+    pair=max(stats,key=stats.get)
+    idx=256+i
+    print(f"merging {pair} into a new token {idx}")
+    ids=merge(ids,pair,idx)
+    merges[pair]=idx
+
+
+print("token length: ",len(tokens))
+print("ids length:",len(ids))
+print(f"compression ratio: {len(tokens)/len(ids):.2f}X")
+
+
+#decoding
+
+#pre-processing: build the vocab mapping each token id back to its bytes
+vocab={idx:bytes([idx]) for idx in range(256)}
+for (p0,p1),idx in merges.items():
+    vocab[idx]=vocab[p0]+vocab[p1] #adding two bytes objects is effectively a concatenation
+def decoding(ids):
+    #given ids (a list of ints), return a Python string
+    tokens=b"".join(vocab[idx] for idx in ids)
+    text=tokens.decode("utf-8",errors='replace')
+    return text
+
+#encoding segment
+
+def encoding(text):
+    tokens=list(text.encode("utf-8"))
+    while len(tokens)>=2:
+        stats=get_stats(tokens)
+        pair=min(stats, key=lambda p:merges.get(p,float("inf"))) #pick the pair merged earliest in training
+        if pair not in merges:
+            break #nothing else is mergeable
+        idx=merges[pair]
+        tokens=merge(tokens,pair,idx)
+    return tokens
+f=encoding('حسینی زاده')
+print(f)
+print(decoding(f))
+
+
+##print(re.findall(gpt2pat,"heyo 123 123 I've come to you with big MASSIvE news "))
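+
+#a minimal sketch (not in the original file): the commented print above
+#assumes a `gpt2pat` regex that this patch never defines. The GPT-2 split
+#pattern from openai/gpt-2's encoder.py is the likely candidate, and would
+#explain the `regex` import at the top, since it supports \p{L} and \p{N}.
+#Kept commented out here so the script's behavior is unchanged.
+#gpt2pat=re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")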