Skip to content

Commit

Permalink
Add special tokens for models
Browse files Browse the repository at this point in the history
We should download the relevant files from HF. I don't think we can
avoid implementing the Jinja2 templates for each model family, though.
We would need to use regular expressions instead of full model names (might be slow).
  • Loading branch information
rlouf committed Sep 27, 2024
1 parent 45b186d commit 148c274
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions prompts/tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class Limits:
    """A pair of delimiter strings, ``begin`` and ``end``.

    Both fields default to the empty string, so ``Limits()`` compares equal
    to ``Limits("", "")``.
    """

    begin: str = ""
    end: str = ""


@dataclass
class Special:
    """Delimiters for the whole sequence and for the user/assistant/system roles.

    Each field holds a ``Limits`` pair and defaults to an empty ``Limits()``.

    The defaults are created through ``default_factory`` rather than written
    as ``Limits("", "")`` literals: a ``Limits`` instance is unhashable, and
    ``dataclasses`` rejects unhashable defaults with ``ValueError`` on
    Python 3.11+. On older interpreters a literal default would be a single
    object shared by every ``Special`` instance.
    """

    sequence: Limits = field(default_factory=Limits)
    user: Limits = field(default_factory=Limits)
    assistant: Limits = field(default_factory=Limits)
    system: Limits = field(default_factory=Limits)


# Table of special tokens per model. Keys are model name strings (they look
# like Hugging Face repository ids -- TODO confirm); the ``None`` key maps to
# a ``Special`` built entirely from defaults.
SPECIAL_TOKENS: Dict[Optional[str], Special] = {
    None: Special(),
    "google/gemma-2-9b": Special(sequence=Limits(begin="<bos>", end="<eos>")),
    "openai-community/gpt2": Special(
        sequence=Limits(begin="", end="<|endoftext|>"),
    ),
    "mistralai/Mistral-7B-v0.1": Special(sequence=Limits(begin="<s>", end="</s>")),
    "mistralai/Mistral-7B-Instruct-v0.1": Special(
        sequence=Limits(begin="<s>", end="</s>"),
        user=Limits(begin="[INST]", end="[/INST]"),
        assistant=Limits(begin="", end="</s>"),
    ),
}

0 comments on commit 148c274

Please sign in to comment.