From 203a37bb3003fa109fa6588a6354609a00e278b9 Mon Sep 17 00:00:00 2001 From: Evgeny Nizhibitsky Date: Thu, 22 Feb 2024 13:35:41 +0000 Subject: [PATCH] Add ruff and pdm. Fix formatting and lint issues --- .gitignore | 3 +- .pdm-python | 1 + README.md | 2 +- exercise.md | 19 ++- minbpe/__init__.py | 4 +- minbpe/base.py | 35 +++-- minbpe/basic.py | 23 +-- minbpe/gpt4.py | 24 ++- minbpe/regex.py | 27 ++-- pdm.lock | 319 ++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 16 ++ requirements.txt | 2 - tests/test_tokenizer.py | 36 +++-- train.py | 4 +- 14 files changed, 441 insertions(+), 74 deletions(-) create mode 100644 .pdm-python create mode 100644 pdm.lock create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 50c957912..0be96963e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ __pycache__/ .DS_Store +.venv/ models/**/* *.pytest_cache *.model -*.vocab \ No newline at end of file +*.vocab diff --git a/.pdm-python b/.pdm-python new file mode 100644 index 000000000..e2e784da0 --- /dev/null +++ b/.pdm-python @@ -0,0 +1 @@ +.venv/bin/python diff --git a/README.md b/README.md index f3b5750f7..1e2609e5c 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ tokenizer.save("toy") # writes two files: toy.model (for loading) and toy.vocab (for viewing) ``` -According to Wikipedia, running bpe on the input string: "aaabdaaabac" for 3 merges results in the string: "XdXac" where X=ZY, Y=ab, and Z=aa. The tricky thing to note is that minbpe always allocates the 256 individual bytes as tokens, and then merges bytes as needed from there. So for us a=97, b=98, c=99, d=100 (their [ASCII](https://www.asciitable.com) values). Then when (a,a) is merged to Z, Z will become 256. Likewise Y will become 257 and X 258. So we start with the 256 bytes, and do 3 merges to get to the result above, with the expected output of [258, 100, 258, 97, 99]. +According to Wikipedia, running bpe on the input string: "aaabdaaabac" for 3 merges results in the string: "XdXac" where X=ZY, Y=ab, and Z=aa. The tricky thing to note is that minbpe always allocates the 256 individual bytes as tokens, and then merges bytes as needed from there. So for us a=97, b=98, c=99, d=100 (their [ASCII](https://www.asciitable.com) values). Then when (a,a) is merged to Z, Z will become 256. Likewise Y will become 257 and X 258. So we start with the 256 bytes, and do 3 merges to get to the result above, with the expected output of [258, 100, 258, 97, 99]. ## inference: GPT-4 comparison diff --git a/exercise.md b/exercise.md index 62be89ed0..f6471a6c9 100644 --- a/exercise.md +++ b/exercise.md @@ -1,8 +1,8 @@ -# exercise +# Exercise Build your own GPT-4 Tokenizer! -### Step 1 +## Step 1 Write the `BasicTokenizer` class, with the following three core functions: @@ -12,20 +12,19 @@ Write the `BasicTokenizer` class, with the following three core functions: Train your tokenizer on whatever text you like and visualize the merged tokens. Do they look reasonable? One default test you may wish to use is the text file `tests/taylorswift.txt`. -### Step 2 +## Step 2 Convert you `BasicTokenizer` into a `RegexTokenizer`, which takes a regex pattern and splits the text exactly as GPT-4 would. Process the parts separately as before, then concatenate the results. Retrain your tokenizer and compare the results before and after. You should see that you will now have no tokens that go across categories (numbers, letters, punctuation, more than one whitespace). 
Use the GPT-4 pattern:

-```
+```python
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
```
-
-### Step 3
+## Step 3

You're now ready to load the merges from the GPT-4 tokenizer and show that your tokenizer produces the identical results for both `encode` and `decode`, matching [tiktoken](https://github.com/openai/tiktoken).

-```
+```python
# match this
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
@@ -38,11 +37,11 @@ Unfortunately, you will run into two issues:
1. It is not trivial to recover the raw merges from the GPT-4 tokenizer. You can easily recover what we call `vocab` here, and what they call and store under `enc._mergeable_ranks`. Feel free to copy-paste the `recover_merges` function in `minbpe/gpt4.py`, which takes these ranks and returns the raw merges. If you wish to know how this function works, read [this](https://github.com/openai/tiktoken/issues/60) and [this](https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306). Basically, under some conditions it is enough to only store the parent nodes (and their rank) and get rid of the precise details of which children merged up to any parent.
2. Second, the GPT-4 tokenizer for some reason permutes its raw bytes. It stores this permutation in the first 256 elements of the mergeable ranks, so you can recover this byte shuffle relatively simply as `byte_shuffle = {i: enc._mergeable_ranks[bytes([i])] for i in range(256)}`. In both your encode and decode, you'll have to shuffle bytes around accordingly. If you're stuck, reference the `minbpe/gpt4.py` file for hints.

-### Step 4
+## Step 4

(Optional, irritating, not obviously useful) Add the ability to handle special tokens. You'll then be able to match the output of tiktoken even when special tokens are present, e.g.:

-```
+```python
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("<|endoftext|>hello world", allowed_special="all")
@@ -50,6 +49,6 @@

Without `allowed_special` tiktoken will error.

-### Step 5
+## Step 5

If you've made it this far, you're now a pro at LLM Tokenization! Sadly, you're not exactly done yet because a lot of LLMs outside of OpenAI (e.g. Llama, Mistral) use [sentencepiece](https://github.com/google/sentencepiece) instead. The primary difference is that sentencepiece runs BPE directly on Unicode code points instead of on UTF-8 encoded bytes. Feel free to explore sentencepiece on your own (good luck, it's not too pretty), and, as a stretch goal if you truly suffer from an excess of time, re-write your BPE to run on Unicode code points and match the Llama 2 tokenizer.
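As a starting point for that Step 5 stretch goal, here is a rough sketch of a BPE training loop that operates on Unicode code points, reusing the `get_stats` and `merge` helpers from `minbpe/base.py`. Numbering new tokens from `0x110000` (just past the top of the Unicode range) is an illustrative assumption, not how sentencepiece or the Llama 2 tokenizer actually assign ids:

```python
from minbpe.base import get_stats, merge


def train_codepoint_bpe(text, num_merges):
    # work on Unicode code points rather than UTF-8 bytes
    ids = [ord(ch) for ch in text]
    merges = {}  # (int, int) -> int
    next_id = 0x110000  # assumed id range for new tokens, just above Unicode
    for _ in range(num_merges):
        stats = get_stats(ids)
        if not stats:
            break  # nothing left to merge
        pair = max(stats, key=stats.get)  # most frequent adjacent pair
        merges[pair] = next_id
        ids = merge(ids, pair, next_id)
        next_id += 1
    return merges
```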
diff --git a/minbpe/__init__.py b/minbpe/__init__.py index d659f5286..b729d546c 100644 --- a/minbpe/__init__.py +++ b/minbpe/__init__.py @@ -1,4 +1,6 @@ from .base import Tokenizer from .basic import BasicTokenizer -from .regex import RegexTokenizer from .gpt4 import GPT4Tokenizer +from .regex import RegexTokenizer + +__all__ = ["BasicTokenizer", "RegexTokenizer", "GPT4Tokenizer", "Tokenizer"] diff --git a/minbpe/base.py b/minbpe/base.py index 65cc45cf9..d263f94b4 100644 --- a/minbpe/base.py +++ b/minbpe/base.py @@ -10,6 +10,7 @@ # ----------------------------------------------------------------------------- # a few helper functions useful for both BasicTokenizer and RegexTokenizer + def get_stats(ids, counts=None): """ Given a list of integers, return a dictionary of counts of consecutive pairs @@ -17,7 +18,7 @@ def get_stats(ids, counts=None): Optionally allows to update an existing dictionary of counts """ counts = {} if counts is None else counts - for pair in zip(ids, ids[1:]): # iterate consecutive elements + for pair in zip(ids, ids[1:]): # iterate consecutive elements counts[pair] = counts.get(pair, 0) + 1 return counts @@ -28,17 +29,18 @@ def merge(ids, pair, idx): of pair with the new integer token idx Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4] """ - newids = [] + new_ids = [] i = 0 while i < len(ids): # if not at the very last position AND the pair matches, replace it - if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]: - newids.append(idx) + if ids[i] == pair[0] and i < len(ids) - 1 and ids[i + 1] == pair[1]: + new_ids.append(idx) i += 2 else: - newids.append(ids[i]) + new_ids.append(ids[i]) i += 1 - return newids + return new_ids + # first two helper functions... def replace_control_characters(s: str) -> str: @@ -49,29 +51,32 @@ def replace_control_characters(s: str) -> str: chars = [] for ch in s: if unicodedata.category(ch)[0] != "C": - chars.append(ch) # this character is ok + chars.append(ch) # this character is ok else: - chars.append(f"\\u{ord(ch):04x}") # escape + chars.append(f"\\u{ord(ch):04x}") # escape return "".join(chars) + def render_token(t: bytes) -> str: # pretty print a token, escaping control characters - s = t.decode('utf-8', errors='replace') + s = t.decode("utf-8", errors="replace") s = replace_control_characters(s) return s + # ----------------------------------------------------------------------------- # the base Tokenizer class + class Tokenizer: """Base class for Tokenizers""" def __init__(self): # default: vocab size of 256 (all bytes), no merges, no patterns - self.merges = {} # (int, int) -> int - self.pattern = "" # str - self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257} - self.vocab = self._build_vocab() # int -> bytes + self.merges = {} # (int, int) -> int + self.pattern = "" # str + self.special_tokens = {} # str -> int, e.g. 
{'<|endoftext|>': 100257} + self.vocab = self._build_vocab() # int -> bytes def train(self, text, vocab_size, verbose=False): # Tokenizer can train a vocabulary of size vocab_size from text @@ -103,7 +108,7 @@ def save(self, file_prefix): """ # write the model: to be used in load() later model_file = file_prefix + ".model" - with open(model_file, 'w') as f: + with open(model_file, "w") as f: # write the version, pattern and merges, that's all that's needed f.write("minbpe v1\n") f.write(f"{self.pattern}\n") @@ -144,7 +149,7 @@ def load(self, model_file): merges = {} special_tokens = {} idx = 256 - with open(model_file, 'r', encoding="utf-8") as f: + with open(model_file, "r", encoding="utf-8") as f: # read the version version = f.readline().strip() assert version == "minbpe v1" diff --git a/minbpe/basic.py b/minbpe/basic.py index 9bc5ab76c..e08131f83 100644 --- a/minbpe/basic.py +++ b/minbpe/basic.py @@ -13,7 +13,6 @@ class BasicTokenizer(Tokenizer): - def __init__(self): super().__init__() @@ -22,12 +21,12 @@ def train(self, text, vocab_size, verbose=False): num_merges = vocab_size - 256 # input text preprocessing - text_bytes = text.encode("utf-8") # raw bytes - ids = list(text_bytes) # list of integers in range 0..255 + text_bytes = text.encode("utf-8") # raw bytes + ids = list(text_bytes) # list of integers in range 0..255 # iteratively merge the most common pairs to create new tokens - merges = {} # (int, int) -> int - vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes + merges = {} # (int, int) -> int + vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes for i in range(num_merges): # count up the number of times every consecutive pair appears stats = get_stats(ids) @@ -42,11 +41,13 @@ def train(self, text, vocab_size, verbose=False): vocab[idx] = vocab[pair[0]] + vocab[pair[1]] # prints if verbose: - print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences") + print( + f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences" + ) # save class variables - self.merges = merges # used in encode() - self.vocab = vocab # used in decode() + self.merges = merges # used in encode() + self.vocab = vocab # used in decode() def decode(self, ids): # given ids (list of integers), return Python string @@ -56,8 +57,8 @@ def decode(self, ids): def encode(self, text): # given a string text, return the token ids - text_bytes = text.encode("utf-8") # raw bytes - ids = list(text_bytes) # list of integers in range 0..255 + text_bytes = text.encode("utf-8") # raw bytes + ids = list(text_bytes) # list of integers in range 0..255 while len(ids) >= 2: # find the pair with the lowest merge index stats = get_stats(ids) @@ -67,7 +68,7 @@ def encode(self, text): # just the first pair in the list, arbitrarily # we can detect this terminating case by a membership check if pair not in self.merges: - break # nothing else can be merged anymore + break # nothing else can be merged anymore # otherwise let's merge the best pair (lowest merge index) idx = self.merges[pair] ids = merge(ids, pair, idx) diff --git a/minbpe/gpt4.py b/minbpe/gpt4.py index fcc65500e..dc3db13bf 100644 --- a/minbpe/gpt4.py +++ b/minbpe/gpt4.py @@ -5,6 +5,7 @@ """ import tiktoken + from .regex import RegexTokenizer @@ -22,7 +23,11 @@ def bpe(mergeable_ranks, token, max_rank): if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + 
parts[min_idx + 2:] + parts = ( + parts[:min_idx] + + [parts[min_idx] + parts[min_idx + 1]] + + parts[min_idx + 2 :] + ) return parts @@ -35,7 +40,7 @@ def recover_merges(mergeable_ranks): merges = {} for token, rank in mergeable_ranks.items(): if len(token) == 1: - continue # skip raw bytes + continue # skip raw bytes pair = tuple(bpe(mergeable_ranks, token, max_rank=rank)) assert len(pair) == 2 # recover the integer ranks of the pair @@ -45,15 +50,17 @@ def recover_merges(mergeable_ranks): return merges + GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" GPT4_SPECIAL_TOKENS = { - '<|endoftext|>': 100257, - '<|fim_prefix|>': 100258, - '<|fim_middle|>': 100259, - '<|fim_suffix|>': 100260, - '<|endofprompt|>': 100276 + "<|endoftext|>": 100257, + "<|fim_prefix|>": 100258, + "<|fim_middle|>": 100259, + "<|fim_suffix|>": 100260, + "<|endofprompt|>": 100276, } + class GPT4Tokenizer(RegexTokenizer): """Lightweight wrapper on RegexTokenizer that matches GPT-4's tokenizer.""" @@ -71,7 +78,7 @@ def __init__(self): self.vocab = vocab # now here is another tricky part. # for some reason, the tokens corresponding to individual bytes - # are permuted in a different order. This is completely non-sensical + # are permuted in a different order. This is completely nonsensical # and probably historical, but therefore we have to deal with it here. self.byte_shuffle = {i: mergeable_ranks[bytes([i])] for i in range(256)} self.inverse_byte_shuffle = {v: k for k, v in self.byte_shuffle.items()} @@ -112,6 +119,7 @@ def save_vocab(self, vocab_file): # simple run as: # python -c "from minbpe import GPT4Tokenizer; GPT4Tokenizer().save_vocab('gpt4.vocab')" from .base import render_token + # build vocab being mindful of the byte shuffle vocab = {idx: bytes([self.inverse_byte_shuffle[idx]]) for idx in range(256)} for (p0, p1), idx in self.merges.items(): diff --git a/minbpe/regex.py b/minbpe/regex.py index 9ed78e433..2f1b8c7d9 100644 --- a/minbpe/regex.py +++ b/minbpe/regex.py @@ -10,17 +10,18 @@ """ import regex as re -from .base import Tokenizer, get_stats, merge +from .base import Tokenizer, get_stats, merge # the main GPT text split patterns, see # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py -GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +GPT2_SPLIT_PATTERN = ( + r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +) GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" class RegexTokenizer(Tokenizer): - def __init__(self, pattern=None): """ - pattern: optional string to override the default (GPT-4 split pattern) @@ -44,8 +45,8 @@ def train(self, text, vocab_size, verbose=False): ids = [list(ch.encode("utf-8")) for ch in text_chunks] # iteratively merge the most common pairs to create new tokens - merges = {} # (int, int) -> int - vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes + merges = {} # (int, int) -> int + vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes for i in range(num_merges): # count the number of times every consecutive pair appears stats = {} @@ -63,11 +64,13 @@ def train(self, text, vocab_size, verbose=False): vocab[idx] = vocab[pair[0]] + vocab[pair[1]] # prints if verbose: - print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences") + print( + 
f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences" + ) # save class variables - self.merges = merges # used in encode() - self.vocab = vocab # used in decode() + self.merges = merges # used in encode() + self.vocab = vocab # used in decode() def register_special_tokens(self, special_tokens): # special_tokens is a dictionary of str -> int @@ -102,7 +105,7 @@ def _encode_chunk(self, text_bytes): # just the first pair in the list, arbitrarily # we can detect this terminating case by a membership check if pair not in self.merges: - break # nothing else can be merged anymore + break # nothing else can be merged anymore # otherwise let's merge the best pair (lowest merge index) idx = self.merges[pair] ids = merge(ids, pair, idx) @@ -115,7 +118,7 @@ def encode_ordinary(self, text): # all chunks of text are encoded separately, then results are joined ids = [] for chunk in text_chunks: - chunk_bytes = chunk.encode("utf-8") # raw bytes + chunk_bytes = chunk.encode("utf-8") # raw bytes chunk_ids = self._encode_chunk(chunk_bytes) ids.extend(chunk_ids) return ids @@ -138,7 +141,9 @@ def encode(self, text, allowed_special="none_raise"): special = {} assert all(token not in text for token in self.special_tokens) elif isinstance(allowed_special, set): - special = {k: v for k, v in self.special_tokens.items() if k in allowed_special} + special = { + k: v for k, v in self.special_tokens.items() if k in allowed_special + } else: raise ValueError(f"allowed_special={allowed_special} not understood") if not special: diff --git a/pdm.lock b/pdm.lock new file mode 100644 index 000000000..700afbff6 --- /dev/null +++ b/pdm.lock @@ -0,0 +1,319 @@ +# This file is @generated by PDM. +# It is not intended for manual editing. + +[metadata] +groups = ["default", "dev"] +strategy = ["cross_platform", "inherit_metadata"] +lock_version = "4.4.1" +content_hash = "sha256:84f7d834d48153408bcb97c0c7fbcf633be5c07727c4314e8403b19a2d25f61c" + +[[package]] +name = "certifi" +version = "2024.2.2" +requires_python = ">=3.6" +summary = "Python package for providing Mozilla's CA Bundle." +groups = ["default"] +files = [ + {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, + {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +requires_python = ">=3.7.0" +summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+groups = ["default"] +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +summary = "Cross-platform colored terminal text." +groups = ["dev"] +marker = "sys_platform == \"win32\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.0" +requires_python = ">=3.7" +summary = "Backport of PEP 654 (exception groups)" +groups = ["dev"] +marker = "python_version < \"3.11\"" +files = [ + {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, + {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, +] + +[[package]] +name = "idna" +version = "3.6" +requires_python = ">=3.5" +summary = "Internationalized Domain Names in Applications (IDNA)" +groups = ["default"] +files = [ + {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, + {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +requires_python = ">=3.7" +summary = "brain-dead simple config-ini parsing" +groups = ["dev"] +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "packaging" +version = "23.2" +requires_python = ">=3.7" +summary = "Core utilities for Python packages" +groups = ["dev"] +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + +[[package]] +name = "pluggy" +version = "1.4.0" +requires_python = ">=3.8" +summary = "plugin and hook calling mechanisms for python" +groups = ["dev"] +files = [ + {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, + {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, +] + +[[package]] +name = "pytest" +version = "8.0.1" +requires_python = ">=3.8" +summary = "pytest: simple powerful testing with Python" +groups = ["dev"] +dependencies = [ + "colorama; sys_platform == 
\"win32\"", + "exceptiongroup>=1.0.0rc8; python_version < \"3.11\"", + "iniconfig", + "packaging", + "pluggy<2.0,>=1.3.0", + "tomli>=1.0.0; python_version < \"3.11\"", +] +files = [ + {file = "pytest-8.0.1-py3-none-any.whl", hash = "sha256:3e4f16fe1c0a9dc9d9389161c127c3edc5d810c38d6793042fb81d9f48a59fca"}, + {file = "pytest-8.0.1.tar.gz", hash = "sha256:267f6563751877d772019b13aacbe4e860d73fe8f651f28112e9ac37de7513ae"}, +] + +[[package]] +name = "regex" +version = "2023.12.25" +requires_python = ">=3.7" +summary = "Alternative regular expression module, to replace re." +groups = ["default"] +files = [ + {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"}, + {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"}, + {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"}, + {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"}, + {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"}, + {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"}, + {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = 
"sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"}, + {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"}, + {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"}, + {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"}, +] + +[[package]] +name = "requests" +version = "2.31.0" +requires_python = ">=3.7" +summary = "Python HTTP for Humans." +groups = ["default"] +dependencies = [ + "certifi>=2017.4.17", + "charset-normalizer<4,>=2", + "idna<4,>=2.5", + "urllib3<3,>=1.21.1", +] +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[[package]] +name = "ruff" +version = "0.2.2" +requires_python = ">=3.7" +summary = "An extremely fast Python linter and code formatter, written in Rust." +groups = ["dev"] +files = [ + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, + {file = 
"ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, + {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, + {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, + {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, + {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, +] + +[[package]] +name = "tiktoken" +version = "0.6.0" +requires_python = ">=3.8" +summary = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +groups = ["default"] +dependencies = [ + "regex>=2022.1.18", + "requests>=2.26.0", +] +files = [ + {file = "tiktoken-0.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:277de84ccd8fa12730a6b4067456e5cf72fef6300bea61d506c09e45658d41ac"}, + {file = "tiktoken-0.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c44433f658064463650d61387623735641dcc4b6c999ca30bc0f8ba3fccaf5c"}, + {file = "tiktoken-0.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afb9a2a866ae6eef1995ab656744287a5ac95acc7e0491c33fad54d053288ad3"}, + {file = "tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62c05b3109fefca26fedb2820452a050074ad8e5ad9803f4652977778177d9f"}, + {file = "tiktoken-0.6.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0ef917fad0bccda07bfbad835525bbed5f3ab97a8a3e66526e48cdc3e7beacf7"}, + {file = "tiktoken-0.6.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e095131ab6092d0769a2fda85aa260c7c383072daec599ba9d8b149d2a3f4d8b"}, + {file = "tiktoken-0.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:05b344c61779f815038292a19a0c6eb7098b63c8f865ff205abb9ea1b656030e"}, + {file = "tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cefb9870fb55dca9e450e54dbf61f904aab9180ff6fe568b61f4db9564e78871"}, + {file = "tiktoken-0.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:702950d33d8cabc039845674107d2e6dcabbbb0990ef350f640661368df481bb"}, + {file = "tiktoken-0.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8d49d076058f23254f2aff9af603863c5c5f9ab095bc896bceed04f8f0b013a"}, + {file = "tiktoken-0.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:430bc4e650a2d23a789dc2cdca3b9e5e7eb3cd3935168d97d43518cbb1f9a911"}, + {file = "tiktoken-0.6.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:293cb8669757301a3019a12d6770bd55bec38a4d3ee9978ddbe599d68976aca7"}, + {file = "tiktoken-0.6.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7bd1a288b7903aadc054b0e16ea78e3171f70b670e7372432298c686ebf9dd47"}, + {file = "tiktoken-0.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac76e000183e3b749634968a45c7169b351e99936ef46f0d2353cd0d46c3118d"}, + {file = "tiktoken-0.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:17cc8a4a3245ab7d935c83a2db6bb71619099d7284b884f4b2aea4c74f2f83e3"}, + {file = "tiktoken-0.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:284aebcccffe1bba0d6571651317df6a5b376ff6cfed5aeb800c55df44c78177"}, + {file = "tiktoken-0.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c1a3a5d33846f8cd9dd3b7897c1d45722f48625a587f8e6f3d3e85080559be8"}, + {file = 
"tiktoken-0.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6318b2bb2337f38ee954fd5efa82632c6e5ced1d52a671370fa4b2eff1355e91"}, + {file = "tiktoken-0.6.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f5f0f2ed67ba16373f9a6013b68da298096b27cd4e1cf276d2d3868b5c7efd1"}, + {file = "tiktoken-0.6.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:75af4c0b16609c2ad02581f3cdcd1fb698c7565091370bf6c0cf8624ffaba6dc"}, + {file = "tiktoken-0.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:45577faf9a9d383b8fd683e313cf6df88b6076c034f0a16da243bb1c139340c3"}, + {file = "tiktoken-0.6.0.tar.gz", hash = "sha256:ace62a4ede83c75b0374a2ddfa4b76903cf483e9cb06247f566be3bf14e6beed"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +requires_python = ">=3.7" +summary = "A lil' TOML parser" +groups = ["dev"] +marker = "python_version < \"3.11\"" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "urllib3" +version = "2.2.1" +requires_python = ">=3.8" +summary = "HTTP library with thread-safe connection pooling, file post, and more." +groups = ["default"] +files = [ + {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, + {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..c8a26e397 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "minbpe" +version = "0.1.0" +description = "Minimal, clean code for the Byte Pair Encoding (BPE) algorithm" +authors = [{ name = "Andrej" }] +dependencies = ["regex==2023.12.25", "tiktoken==0.6.0"] +requires-python = ">=3.10" +readme = "README.md" +license = { text = "MIT" } + + +[tool.pdm] +distribution = false + +[tool.pdm.dev-dependencies] +dev = ["ruff>=0.2.2", "pytest>=8.0.1"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c46ba631a..000000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -regex -tiktoken \ No newline at end of file diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 6dda21237..903652f62 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,19 +1,22 @@ +import os + import pytest import tiktoken -import os -from minbpe import BasicTokenizer, RegexTokenizer, GPT4Tokenizer +from minbpe import BasicTokenizer, GPT4Tokenizer, RegexTokenizer # ----------------------------------------------------------------------------- # common test data # a few strings to test the tokenizers on test_strings = [ - "", # empty string - "?", # single character - "hello world!!!? (안녕하세요!) lol123 😉", # fun small string - "FILE:taylorswift.txt", # FILE: is handled as a special string in unpack() + "", # empty string + "?", # single character + "hello world!!!? (안녕하세요!) lol123 😉", # fun small string + "FILE:taylorswift.txt", # FILE: is handled as a special string in unpack() ] + + def unpack(text): # we do this because `pytest -v .` prints the arguments to console, and we don't # want to print the entire contents of the file, it creates a mess. So here we go. 
@@ -25,6 +28,7 @@ def unpack(text): else: return text + specials_string = """ <|endoftext|>Hello world this is one document <|endoftext|>And this is another document @@ -32,11 +36,11 @@ def unpack(text): <|endoftext|>Last document!!! 👋<|endofprompt|> """.strip() special_tokens = { - '<|endoftext|>': 100257, - '<|fim_prefix|>': 100258, - '<|fim_middle|>': 100259, - '<|fim_suffix|>': 100260, - '<|endofprompt|>': 100276 + "<|endoftext|>": 100257, + "<|fim_prefix|>": 100258, + "<|fim_middle|>": 100259, + "<|fim_suffix|>": 100260, + "<|endofprompt|>": 100276, } llama_text = """ <|endoftext|>The llama (/ˈlɑːmə/; Spanish pronunciation: [ˈʎama] or [ˈʝama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. @@ -48,8 +52,11 @@ def unpack(text): # ----------------------------------------------------------------------------- # tests + # test encode/decode identity for a few different strings -@pytest.mark.parametrize("tokenizer_factory", [BasicTokenizer, RegexTokenizer, GPT4Tokenizer]) +@pytest.mark.parametrize( + "tokenizer_factory", [BasicTokenizer, RegexTokenizer, GPT4Tokenizer] +) @pytest.mark.parametrize("text", test_strings) def test_encode_decode_identity(tokenizer_factory, text): text = unpack(text) @@ -58,6 +65,7 @@ def test_encode_decode_identity(tokenizer_factory, text): decoded = tokenizer.decode(ids) assert text == decoded + # test that our tokenizer matches the official GPT-4 tokenizer @pytest.mark.parametrize("text", test_strings) def test_gpt4_tiktoken_equality(text): @@ -68,6 +76,7 @@ def test_gpt4_tiktoken_equality(text): gpt4_tokenizer_ids = tokenizer.encode(text) assert gpt4_tokenizer_ids == tiktoken_ids + # test the handling of special tokens def test_gpt4_tiktoken_equality_special_tokens(): tokenizer = GPT4Tokenizer() @@ -76,6 +85,7 @@ def test_gpt4_tiktoken_equality_special_tokens(): gpt4_tokenizer_ids = tokenizer.encode(specials_string, allowed_special="all") assert gpt4_tokenizer_ids == tiktoken_ids + # reference test to add more tests in the future @pytest.mark.parametrize("tokenizer_factory", [BasicTokenizer, RegexTokenizer]) def test_wikipedia_example(tokenizer_factory): @@ -106,6 +116,7 @@ def test_wikipedia_example(tokenizer_factory): assert ids == [258, 100, 258, 97, 99] assert tokenizer.decode(tokenizer.encode(text)) == text + @pytest.mark.parametrize("special_tokens", [{}, special_tokens]) def test_save_load(special_tokens): # take a bit more complex piece of text and train the tokenizer, chosen at random @@ -131,5 +142,6 @@ def test_save_load(special_tokens): for file in ["test_tokenizer_tmp.model", "test_tokenizer_tmp.vocab"]: os.remove(file) + if __name__ == "__main__": pytest.main() diff --git a/train.py b/train.py index b2768d3ed..c54008164 100644 --- a/train.py +++ b/train.py @@ -5,6 +5,7 @@ import os import time + from minbpe import BasicTokenizer, RegexTokenizer # open some text and train a vocab of 512 tokens @@ -15,7 +16,6 @@ t0 = time.time() for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]): - # construct the Tokenizer object and kick off verbose training tokenizer = TokenizerClass() tokenizer.train(text, 512, verbose=True) @@ -24,4 +24,4 @@ tokenizer.save(prefix) t1 = time.time() -print(f"Training took {t1 - t0:.2f} seconds") \ No newline at end of file +print(f"Training took {t1 - t0:.2f} seconds")
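Once `train.py` has written its `.model` files, they can be loaded back and used directly. A small usage sketch (the sample string is arbitrary, and the path assumes the `models/` directory used above):

```python
from minbpe import RegexTokenizer

tokenizer = RegexTokenizer()
tokenizer.load("models/regex.model")  # model file written by train.py

text = "hello world!!!? (안녕하세요!) lol123 😉"
ids = tokenizer.encode(text)
assert tokenizer.decode(ids) == text  # encode/decode should round-trip exactly
```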