diff --git a/.github/workflows/workflow-pr-gate.yml b/.github/workflows/workflow-pr-gate.yml
index 207bb3014..b500f3ddc 100644
--- a/.github/workflows/workflow-pr-gate.yml
+++ b/.github/workflows/workflow-pr-gate.yml
@@ -134,6 +134,7 @@ jobs:
     needs:
       - unit-tests-linux-python-other
       - unit-tests-gpu-python-latest
+      - server-tests
     name: End Stage 2
     runs-on: ubuntu-latest
     steps:
diff --git a/.gitignore b/.gitignore
index b53296e12..c590bc397 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,6 @@ guidance/_rust/Cargo.lock
 
 notebooks/**/*.papermill_out.ipynb
 
-.mypy_cache/*
\ No newline at end of file
+.mypy_cache/*
+
+**/scratch.*
\ No newline at end of file
diff --git a/README.md b/README.md
index c4d9f7efe..b0f6c3017 100644
--- a/README.md
+++ b/README.md
@@ -55,8 +55,8 @@ else:
 ```python
 from guidance import user, assistant
 
-# load a chat model
-chat_lm = models.LlamaCppChat(path)
+# load a model
+chat_lm = models.LlamaCpp(path)
 
 # wrap with chat block contexts
 with user():
diff --git a/guidance/_chat.py b/guidance/_chat.py
new file mode 100644
index 000000000..7790af37d
--- /dev/null
+++ b/guidance/_chat.py
@@ -0,0 +1,210 @@
+import warnings
+import uuid
+import inspect
+
+class ChatTemplate:
+    """Base class for chat/instruct templates; subclasses supply the role start and end text."""
+
+    def get_role_start(self, role_name, **kwargs):
+        raise NotImplementedError(
+            "You need to use a ChatTemplate subclass that overrides the get_role_start method"
+        )
+
+    def get_role_end(self, role_name=None):
+        raise NotImplementedError(
+            "You need to use a ChatTemplate subclass that overrides the get_role_end method"
+        )
+    
+class ChatTemplateCache:
+    """Registry of chat template classes keyed by template string.
+    Space characters are stripped from keys on insert, lookup and membership tests."""
+
+    def __init__(self):
+        # every key is stored with its space characters stripped out
+        self._cache = {}
+
+    def __getitem__(self, key):
+        return self._cache[key.replace(" ", "")]
+
+    def __setitem__(self, key, value):
+        self._cache[key.replace(" ", "")] = value
+
+    def __contains__(self, key):
+        return key.replace(" ", "") in self._cache
+    
+# Feels weird having to instantiate this, but it's a singleton for all purposes
+# TODO [HN]: Add an alias system so we can instantiate with other simple keys (e.g. "llama2" instead of the full template string)
+CHAT_TEMPLATE_CACHE = ChatTemplateCache() 
+
+class UnsupportedRoleException(Exception):
+    """Raised by a chat template when asked for the tags of a role it does not define."""
+    def __init__(self, role_name, instance):
+        self.role_name, self.instance = role_name, instance
+        super().__init__(self._format_message())
+
+    def _format_message(self):
+        return f"Role {self.role_name} is not supported by the {self.instance.__class__.__name__} chat template. "
+
+def load_template_class(chat_template=None):
+    """Resolve `chat_template` to a ChatTemplate subclass (the class itself, not an instance).
+
+    Resolution order:
+    - A ChatTemplate subclass is returned unchanged (the bare base class is rejected)
+    - A string is looked up in CHAT_TEMPLATE_CACHE and the registered class is returned
+    - [TODO] A string missing from the cache should be turned into a class dynamically
+    - Any other non-None value falls back to ChatMLTemplate and emits a warning
+    - None falls back to ChatMLTemplate without a warning
+    """
+    is_template_class = inspect.isclass(chat_template) and issubclass(chat_template, ChatTemplate)
+    if is_template_class and chat_template is ChatTemplate:
+        raise Exception("You can't use the base ChatTemplate class directly. Create or use a subclass instead.")
+    if is_template_class:
+        return chat_template
+
+    # Strings are matched against the registry of known model templates.
+    # TODO: Expand keys of cache to include aliases for popular model types (e.g. "llama2, phi3")
+    # Can possibly accomplish this with an "aliases" dictionary that maps all aliases to the canonical key in cache
+    # TODO: Add logic here to try to auto-create class dynamically via _template_class_from_string method
+    if isinstance(chat_template, str) and chat_template in CHAT_TEMPLATE_CACHE:
+        return CHAT_TEMPLATE_CACHE[chat_template]
+
+    # Reaching this point with a non-None value means it could not be resolved, so tell the user
+    if chat_template is not None:
+        warnings.warn(f"""Chat template {chat_template} was unable to be loaded directly into guidance.
+                        Defaulting to the ChatML format which may not be optimal for the selected model. 
+                        For best results, create and pass in a `guidance.ChatTemplate` subclass for your model.""")
+
+    # ChatML is the fallback template
+    return ChatMLTemplate
+        
+
+def _template_class_from_string(template_str):
+    """Placeholder for building a chat template class from a template string; currently returns None."""
+    # TODO: Try to build this, perhaps based on passing unit tests we create?
+    pass
+
+
+# CACHE IMPLEMENTATIONS:
+
+# --------------------------------------------------
+# @@@@ ChatML @@@@
+# --------------------------------------------------
+# Note that all grammarless models will default to this syntax, since we typically send chat formatted messages.
+chatml_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
+class ChatMLTemplate(ChatTemplate):
+    """ChatML role markers: any role name is accepted and wrapped in im_start / im_end tags."""
+    template_str = chatml_template
+
+    def get_role_start(self, role_name):
+        return "<|im_start|>{}\n".format(role_name)
+
+    def get_role_end(self, role_name=None):
+        return "<|im_end|>\n"
+CHAT_TEMPLATE_CACHE[chatml_template] = ChatMLTemplate
+
+
+# --------------------------------------------------
+# @@@@ Llama-2 @@@@
+# --------------------------------------------------
+# [05/08/24] https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer_config.json#L12
+llama2_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+class Llama2ChatTemplate(ChatTemplate):
+    """Llama-2 chat role markers; supports the system, user and assistant roles."""
+    template_str = llama2_template
+
+    def get_role_start(self, role_name):
+        if role_name == "system":
+            return "[INST] <<SYS>>\n"
+        elif role_name == "user":
+            return "<s>[INST]"
+        elif role_name == "assistant":
+            return " "
+        else:
+            raise UnsupportedRoleException(role_name, self)
+
+    def get_role_end(self, role_name=None):
+        if role_name == "system":
+            return "\n<</SYS>>"
+        elif role_name == "user":
+            return " [/INST]"
+        elif role_name == "assistant":
+            return "</s>"
+        else:
+            raise UnsupportedRoleException(role_name, self)
+
+CHAT_TEMPLATE_CACHE[llama2_template] = Llama2ChatTemplate
+
+
+# --------------------------------------------------
+# @@@@ Llama-3 @@@@
+# --------------------------------------------------
+# [05/08/24] https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json#L2053
+llama3_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+class Llama3ChatTemplate(ChatTemplate):
+    """Llama-3 chat role markers; supports the system, user and assistant roles."""
+    template_str = llama3_template
+
+    def get_role_start(self, role_name):
+        if role_name == "system":
+            return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
+        elif role_name == "user":
+            return "<|start_header_id|>user<|end_header_id|>\n\n"
+        elif role_name == "assistant":
+            return "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        else:
+            raise UnsupportedRoleException(role_name, self)
+
+    def get_role_end(self, role_name=None):
+        return "<|eot_id|>"
+    
+CHAT_TEMPLATE_CACHE[llama3_template] = Llama3ChatTemplate
+
+# --------------------------------------------------
+# @@@@ Phi-3 @@@@
+# --------------------------------------------------
+# [05/08/24] https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/tokenizer_config.json#L119
+phi3_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+class Phi3ChatTemplate(ChatTemplate):
+    """Phi-3 chat role markers; only the user and assistant roles are supported."""
+    template_str = phi3_template
+
+    def get_role_start(self, role_name):
+        if role_name == "user":
+            return "<|user|>\n"
+        elif role_name == "assistant":
+            return "<|assistant|>\n"
+        else:
+            raise UnsupportedRoleException(role_name, self)
+
+    def get_role_end(self, role_name=None):
+        return "<|end|>\n"
+    
+CHAT_TEMPLATE_CACHE[phi3_template] = Phi3ChatTemplate
+
+
+# --------------------------------------------------
+# @@@@ Mistral-7B-Instruct-v0.2 @@@@
+# --------------------------------------------------
+# [05/08/24] https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/tokenizer_config.json#L42
+mistral_7b_instruct_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+class Mistral7BInstructChatTemplate(ChatTemplate):
+    """Mistral-7B-Instruct role markers; only the user and assistant roles are supported."""
+    template_str = mistral_7b_instruct_template
+
+    def get_role_start(self, role_name):
+        # user turns open with an [INST] marker; assistant turns have no opening text
+        if role_name == "user":
+            return "[INST] "
+        if role_name == "assistant":
+            return ""
+        raise UnsupportedRoleException(role_name, self)
+
+    def get_role_end(self, role_name=None):
+        # user turns close the [INST] marker; assistant turns end with </s>
+        if role_name == "user":
+            return " [/INST]"
+        if role_name == "assistant":
+            return "</s>"
+        raise UnsupportedRoleException(role_name, self)
+        
+CHAT_TEMPLATE_CACHE[mistral_7b_instruct_template] = Mistral7BInstructChatTemplate
\ No newline at end of file
diff --git a/guidance/library/_role.py b/guidance/library/_role.py
index 122caa523..fa8859986 100644
--- a/guidance/library/_role.py
+++ b/guidance/library/_role.py
@@ -7,14 +7,10 @@
 span_start = "<||_html:<span style='background-color: rgba(255, 180, 0, 0.3); border-radius: 3px;'>_||>"
 span_end = "<||_html:</span>_||>"
 
-
 @guidance
 def role_opener(lm, role_name, **kwargs):
     indent = getattr(lm, "indent_roles", True)
-    if not hasattr(lm, "get_role_start"):
-        raise Exception(
-            f"You need to use a chat model in order the use role blocks like `with {role_name}():`! Perhaps you meant to use the {type(lm).__name__}Chat class?"
-        )
+
 
     # Block start container (centers elements)
     if indent:
@@ -25,8 +21,17 @@ def role_opener(lm, role_name, **kwargs):
         lm += nodisp_start
     else:
         lm += span_start
-
-    lm += lm.get_role_start(role_name, **kwargs)
+    
+    # TODO [HN]: Temporary change while I instrument chat_template in transformers only.
+    # Eventually have all models use chat_template.
+    if hasattr(lm, "get_role_start"):
+        lm += lm.get_role_start(role_name, **kwargs)
+    elif hasattr(lm, "chat_template"):
+        lm += lm.chat_template.get_role_start(role_name)
+    else:
+        raise Exception(
+            f"You need to use a chat model in order the use role blocks like `with {role_name}():`! Perhaps you meant to use the {type(lm).__name__}Chat class?"
+        )
 
     # End of either debug or HTML no disp block
     if indent:
@@ -46,7 +51,12 @@ def role_closer(lm, role_name, **kwargs):
     else:
         lm += span_start
 
-    lm += lm.get_role_end(role_name)
+    # TODO [HN]: Temporary change while I instrument chat_template in transformers only.
+    # Eventually have all models use chat_template.
+    if hasattr(lm, "get_role_end"):
+        lm += lm.get_role_end(role_name)
+    elif hasattr(lm, "chat_template"):
+        lm += lm.chat_template.get_role_end(role_name)
 
     # End of either debug or HTML no disp block
     if indent:
@@ -60,7 +70,7 @@ def role_closer(lm, role_name, **kwargs):
 
     return lm
 
-
+# TODO HN: Add a docstring to better describe arbitrary role functions
 def role(role_name, text=None, **kwargs):
     if text is None:
         return block(
diff --git a/guidance/models/__init__.py b/guidance/models/__init__.py
index c1d2e6767..41391c488 100644
--- a/guidance/models/__init__.py
+++ b/guidance/models/__init__.py
@@ -1,8 +1,8 @@
 from ._model import Model, Instruct, Chat
 
 # local models
-from .transformers._transformers import Transformers, TransformersChat
-from .llama_cpp import LlamaCpp, LlamaCppChat, MistralInstruct, MistralChat
+from .transformers._transformers import Transformers
+from .llama_cpp import LlamaCpp
 from ._mock import Mock, MockChat
 
 # grammarless models (we can't do constrained decoding for them)
@@ -15,15 +15,12 @@
 )
 from ._azure_openai import (
     AzureOpenAI,
-    AzureOpenAIChat,
-    AzureOpenAICompletion,
-    AzureOpenAIInstruct,
 )
 from ._azureai_studio import AzureAIStudioChat
-from ._openai import OpenAI, OpenAIChat, OpenAIInstruct, OpenAICompletion
+from ._openai import OpenAI
 from ._lite_llm import LiteLLM, LiteLLMChat, LiteLLMInstruct, LiteLLMCompletion
 from ._cohere import Cohere, CohereCompletion, CohereInstruct
-from ._anthropic import Anthropic, AnthropicChat
+from ._anthropic import Anthropic
 from ._googleai import GoogleAI, GoogleAIChat
 from ._togetherai import (
     TogetherAI,
diff --git a/guidance/models/_anthropic.py b/guidance/models/_anthropic.py
index a1eba1c29..334d86330 100644
--- a/guidance/models/_anthropic.py
+++ b/guidance/models/_anthropic.py
@@ -23,12 +23,6 @@ def __init__(
                 "Please install the anthropic package version >= 0.7 using `pip install anthropic -U` in order to use guidance.models.Anthropic!"
             )
 
-        # if we are called directly (as opposed to through super()) then we convert ourselves to a more specific subclass if possible
-        if self.__class__ is Anthropic:
-            raise Exception(
-                "The Anthropic class is not meant to be used directly! Please use AnthropicChat assuming the model you are using is chat-based."
-            )
-
         if api_key is None:
             api_key = os.environ.get("ANTHROPIC_API_KEY")
 
@@ -38,7 +32,6 @@ def __init__(
             )
 
         self.anthropic = Anthropic(api_key=api_key, **kwargs)
-
         self.model_name = model
 
         # we pretend it tokenizes like gpt2 if tiktoken does not know about it... TODO: make this better
@@ -52,24 +45,73 @@ def __init__(
 
     def _generator(self, prompt, temperature):
 
+        # find the role tags
+        pos = 0
+        role_end = b"<|im_end|>\n"
+        messages = []
+        found = True
+        system_prompt = None # Not mandatory, but we'll store it if found
+        while found:
+
+            # find the role text blocks
+            found = False
+            for role_name, start_bytes in (
+                ("system", b"<|im_start|>system\n"),
+                ("user", b"<|im_start|>user\n"),
+                ("assistant", b"<|im_start|>assistant\n"),
+            ):
+                if prompt[pos:].startswith(start_bytes):
+                    pos += len(start_bytes)
+                    end_pos = prompt[pos:].find(role_end)
+                    if end_pos < 0:
+                        assert (
+                            role_name == "assistant"
+                        ), "Bad chat format! Last role before gen needs to be assistant!"
+                        break
+                    btext = prompt[pos : pos + end_pos]
+                    pos += end_pos + len(role_end)
+                    if role_name == "system":
+                        system_prompt = btext.decode("utf8")
+                    else:
+                        messages.append(
+                            {"role": role_name, "content": btext.decode("utf8")}
+                        )
+                    found = True
+                    break
+
+        # Add nice exception if no role tags were used in the prompt.
+        # TODO: Move this somewhere more general for all chat models?
+        if messages == []:
+            raise ValueError(
+                f"The AnthropicAI model {self.model_name} is a Chat-based model and requires role tags in the prompt! \
+            Make sure you are using guidance context managers like `with system():`, `with user():` and `with assistant():` \
+            to appropriately format your guidance program for this type of model."
+            )
+
         # update our shared data state
         self._reset_shared_data(prompt, temperature)
 
+        # API call and response handling
         try:
-            generator = self.anthropic.completions.create(
+            # Need to do this because Anthropic API is a bit weird with the system keyword...
+            model_kwargs = dict(
                 model=self.model_name,
-                prompt=prompt.decode("utf8"),
-                max_tokens_to_sample=self.max_streaming_tokens,
-                stream=True,
+                messages=messages,
+                max_tokens=self.max_streaming_tokens,
                 temperature=temperature,
             )
+            if system_prompt is not None:
+                model_kwargs["system"] = system_prompt
+            generator = self.anthropic.messages.stream(
+                **model_kwargs,
+            )
         except Exception as e:  # TODO: add retry logic
             raise e
 
-        for part in generator:
-            chunk = part.completion or ""
-            # print(chunk)
-            yield chunk.encode("utf8")
+        with generator as stream:
+            for chunk in stream.text_stream:
+                # print(chunk)
+                yield chunk.encode("utf8")
 
 
 class Anthropic(Grammarless):
@@ -93,28 +135,6 @@ def __init__(
     ):
         """Build a new Anthropic model object that represents a model in a given state."""
 
-        # if we are called directly (as opposed to through super()) then we convert ourselves to a more specific subclass if possible
-        if self.__class__ is Anthropic:
-            found_subclass = None
-
-            # chat
-            found_subclass = AnthropicChat  # we assume all models are chat right now
-
-            # convert to any found subclass
-            self.__class__ = found_subclass
-            found_subclass.__init__(
-                self,
-                model,
-                tokenizer=tokenizer,
-                echo=echo,
-                api_key=api_key,
-                max_streaming_tokens=max_streaming_tokens,
-                timeout=timeout,
-                compute_log_probs=compute_log_probs,
-                **kwargs,
-            )
-            return  # we return since we just ran init above and don't need to run again
-
         super().__init__(
             engine=AnthropicEngine(
                 model=model,
@@ -127,16 +147,3 @@ def __init__(
             ),
             echo=echo,
         )
-
-
-class AnthropicChat(Anthropic, Chat):
-    def get_role_start(self, role_name, **kwargs):
-        if role_name == "user":
-            return "\n\nHuman:"
-        if role_name == "assistant":
-            return "\n\nAssistant:"
-        if role_name == "system":
-            return ""
-
-    def get_role_end(self, role_name=None):
-        return ""
diff --git a/guidance/models/_azure_openai.py b/guidance/models/_azure_openai.py
index afa804311..9a5420e51 100644
--- a/guidance/models/_azure_openai.py
+++ b/guidance/models/_azure_openai.py
@@ -7,11 +7,7 @@
 
 from ._grammarless import Grammarless
 from ._model import Chat, Instruct
-from ._openai import (
-    OpenAIChatEngine,
-    OpenAICompletionEngine,
-    OpenAIInstructEngine
-)
+from ._openai import OpenAIEngine
 
 try:
     import openai as openai_package
@@ -70,32 +66,6 @@ def __init__(
 
         if api_key is None and azure_ad_token_provider is None:
             raise ValueError("Please provide either api_key or azure_ad_token_provider")
-
-        # if we are called directly (as opposed to through super()) then we convert ourselves to
-        # a more specific subclass if possible
-        if self.__class__ is AzureOpenAI:
-            # Default to a completion model
-            found_subclass: Type[AzureOpenAI] = (
-                AzureOpenAICompletion
-                if model.endswith("-instruct") 
-                else AzureOpenAIChat
-            )
-
-            # convert to any found subclass
-            self.__class__ = found_subclass
-            found_subclass.__init__(
-                self,
-                model=model,
-                azure_endpoint=azure_endpoint,
-                api_key=api_key,
-                azure_ad_token_provider=azure_ad_token_provider,
-                azure_deployment=azure_deployment,
-                tokenizer=tokenizer,
-                echo=echo,
-                version=version,
-                **kwargs,
-            )
-            return
         
         parsed_url = urlparse(azure_endpoint)
 
@@ -110,17 +80,11 @@ def __init__(
             if "api-version" not in parsed_query
             else parsed_query["api-version"]
         )
-        engine_map = {
-            AzureOpenAICompletion: OpenAICompletionEngine,
-            AzureOpenAIChat: OpenAIChatEngine,
-            AzureOpenAIInstruct: OpenAIInstructEngine,
-        }
-        engine_class = engine_map[self.__class__]
 
         if tokenizer is None:
             tokenizer = tiktoken.encoding_for_model(model)
 
-        engine_instance = engine_class(
+        engine_instance = OpenAIEngine(
             tokenizer=tokenizer,
             max_streaming_tokens=max_streaming_tokens,
             timeout=timeout,
@@ -139,24 +103,3 @@ def __init__(
             engine_instance,
             echo=echo,
         )
-
-
-class AzureOpenAIChat(AzureOpenAI, Chat):
-    pass
-
-
-class AzureOpenAICompletion(AzureOpenAI):
-    pass
-
-
-class AzureOpenAIInstruct(AzureOpenAI, Instruct):
-    def get_role_start(self, name):
-        return ""
-
-    def get_role_end(self, name):
-        if name == "instruction":
-            return "<|endofprompt|>"
-        else:
-            raise ValueError(
-                f"The OpenAIInstruct model does not know about the {name} role type!"
-            )
\ No newline at end of file
diff --git a/guidance/models/_grammarless.py b/guidance/models/_grammarless.py
index 045886e50..c4cfb315b 100644
--- a/guidance/models/_grammarless.py
+++ b/guidance/models/_grammarless.py
@@ -6,7 +6,9 @@
 import re
 import logging
 from ._model import Tokenizer, Engine, Model, format_pattern, ConstraintException
+from .._chat import ChatMLTemplate
 
+import warnings
 logger = logging.getLogger(__name__)
 
 
@@ -106,7 +108,9 @@ def __init__(self, tokenizer):
 
         self._orig_tokenizer = tokenizer
 
-        super().__init__(byte_tokens, bos_token_id, eos_token_id)
+        # Grammarless Tokenizers MUST use the ChatMLTemplate in guidance today
+        chat_template = ChatMLTemplate
+        super().__init__(byte_tokens, chat_template, bos_token_id, eos_token_id)
 
     def __call__(self, byte_string):
         """Returns a list of tokens that represent the given byte string."""
@@ -118,13 +122,12 @@ def __init__(self, tokenizer, max_streaming_tokens, timeout, compute_log_probs):
         self.max_streaming_tokens = max_streaming_tokens
         self.timeout = timeout
 
-        self._data_queue = (
-            queue.Queue()
-        )  # this is where the streaming thread puts results
+        # this is where the streaming thread puts results
+        self._data_queue = queue.Queue()
         self._data = b""  # these are the bytes we are ready to use in the main thread
-        self._not_running_stream = (
-            threading.Event()
-        )  # this is phrased negatively so we can wait for the stop event
+        
+        # this is phrased negatively so we can wait for the stop event
+        self._not_running_stream = threading.Event() 
         self._last_call = 0
         self._num_calls_made = 0
         self._current_temp = 0
@@ -139,7 +142,12 @@ def __init__(self, tokenizer, max_streaming_tokens, timeout, compute_log_probs):
         if not isinstance(tokenizer, Tokenizer):
             tokenizer = GrammarlessTokenizer(tokenizer)
 
-        # build the
+        # GrammarlessEngines must use a tokenizer whose chat_template is ChatMLTemplate
+        # TODO: Consider different enforcement of this 
+        if tokenizer.chat_template is not ChatMLTemplate:
+            raise Exception("The tokenizer provided to the engine follows a non-ChatML format in its chat_template. \
+                    Using a transformers, tiktoken, or guidance.GrammarlessTokenizer directly will solve this issue.")
+        # build the Engine
         super().__init__(tokenizer=tokenizer, compute_log_probs=compute_log_probs)
 
     def __call__(self, *args, **kwargs):
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index cd5cfa296..19665bed8 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -51,6 +51,7 @@
 )
 
 from .. import _serialization_pb2
+from .._chat import load_template_class
 
 if TYPE_CHECKING:
     from ..library._block import ContextBlock
@@ -71,8 +72,8 @@ class Tokenizer:
     This class should be subclassed by specific implementations and then used as the
     tokenizer in the corresponding Engine subclass.
     """
-
-    def __init__(self, tokens, bos_token_id=None, eos_token_id=None):
+    # TODO: We should probably have encode and decode methods on here...
+    def __init__(self, tokens, chat_template, bos_token_id=None, eos_token_id=None):
 
         # a numpy array of token byte strings indexed by their token id
         if isinstance(tokens, list):
@@ -91,6 +92,11 @@ def __init__(self, tokens, bos_token_id=None, eos_token_id=None):
             self.tokens[0], bytes
         ), "The tokens need to be provided as bytes!"
 
+
+        # This method supports None, a huggingface style jinja2_template_str, or a ChatTemplate subclass
+        # Defaults to ChatML if nothing is found
+        self.chat_template = load_template_class(chat_template)
+
         self.bos_token_id = bos_token_id
         self.bos_token = (
             None if self.bos_token_id is None else self.tokens[self.bos_token_id]
@@ -205,6 +211,9 @@ def __init__(self, tokenizer, compute_log_probs=False):
 
         self.metrics = GuidanceEngineMetrics()
 
+    def get_chat_template(self): # TODO [HN]: Add more logic here...should we instantiate class here? do we even need to?
+        return self.tokenizer.chat_template() # Instantiate the class before returning to client for now
+    
     def reset_metrics(self):
         self.metrics = GuidanceEngineMetrics()
 
@@ -299,15 +308,11 @@ def next(self, logits):
             # if requested we compute the log probabilities so we can track the probabilities of each node
             if self.compute_log_probs:
                 if torch_is_imported:
-                    probs = (
-                        torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
-                        .cpu()
-                        .numpy()
-                    )  # note we don't adjust for temp since we consider that a sampling step, not part of the probs
+                    # note we don't adjust for temp since we consider that a sampling step, not part of the probs
+                    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).cpu().numpy()
                 else:
-                    probs = softmax(
-                        logits, axis=-1
-                    )  # this numpy code is slower, so we don't use it if we have torch...
+                    # this numpy code is slower, so we don't use it if we have torch...
+                    probs = softmax(logits, axis=-1)
                 self.tokenizer.clean_duplicate_tokens(probs)
                 self._trie.compute_probs(probs)  # C++ impl
             else:
@@ -318,18 +323,15 @@ def next(self, logits):
 
             # get the sampling order
             if current_temp == 0:
-                sampling_order = np.argsort(
-                    -logits
-                )  # we need numpy so the enumerate below does not get really slow...
+                # we need numpy so the enumerate below does not get really slow...
+                sampling_order = np.argsort(-logits)  
             else:
                 # assert top_p == 1, "Still need to add support for top_p!"
                 if torch_is_imported:
                     logits = torch.tensor(logits)
                     torch.div(logits, current_temp, out=logits)
                     probs_torch = torch.nn.functional.softmax(logits, dim=-1)
-                    sampling_order = (
-                        torch.multinomial(probs_torch, len(probs_torch)).cpu().numpy()
-                    )
+                    sampling_order = torch.multinomial(probs_torch, len(probs_torch)).cpu().numpy()
                 else:
                     # this numpy version allows us to drop our dependence on pytorch...but it is way slower
                     if probs is None:
@@ -349,9 +351,7 @@ def next(self, logits):
                     break
 
                 # make sure it matches any forced prefix
-                used_forced_pos = min(
-                    self._forced_pos, self._start_pos + len(self._sampled_token)
-                )
+                used_forced_pos = min(self._forced_pos, self._start_pos + len(self._sampled_token))
                 if (
                     self._start_pos < self._forced_pos
                     and not self._sampled_token.startswith(
@@ -371,9 +371,9 @@ def next(self, logits):
 
                 # check to see if the sampled token is allowed
                 token_pos = offset
-                node = (
-                    self._trie
-                )  # this is the Trie node we were left at when we could force the next byte above
+
+                # this is the Trie node we were left at when we could force the next byte above
+                node = self._trie
 
                 while token_pos < len(self._sampled_token):
                     next_byte = self._sampled_token[token_pos : token_pos + 1]
@@ -382,11 +382,9 @@ def next(self, logits):
                     # if we don't have a cached match flag compute it using the grammar
                     if next_node.match_version < self._token_trie.match_version:
                         next_byte_mask = self._parser.next_byte_mask()
-                        for (
-                            byte
-                        ) in (
-                            node.keys()
-                        ):  # we update all the children since the parser knows the full mask
+
+                        # we update all the children since the parser knows the full mask
+                        for byte in node.keys():  
                             child = node.child(byte)
                             child.match_version = self._token_trie.match_version
                             child.match = next_byte_mask[byte[0]]
@@ -480,14 +478,10 @@ def next(self, logits):
                     break  # if we already have a full match we don't try more tokens we just give up as soon as the model deviates from the grammar
 
         is_done = False
-        while (
-            True
-        ):  # each iteration generates one more token (and some of the associated bytes)
+        while True:  # each iteration generates one more token (and some of the associated bytes)
             if is_new_token:
                 # emit whatever we know will not be hidden
-                new_bytes = self._parser.bytes[
-                    self._generated_pos : self._parser.earliest_hidden_start()
-                ]
+                new_bytes = self._parser.bytes[self._generated_pos : self._parser.earliest_hidden_start()]
 
                 # if we cannot consume any more tokens then we are done
                 if (
@@ -508,9 +502,7 @@ def next(self, logits):
                     #     self._cache_state["new_token_ids"].append(self._sampled_token_ind)
 
                     # capture the named groups from the parse tree
-                    self._parser.get_captures(
-                        self._captured_data, self._captured_log_prob_data
-                    )
+                    self._parser.get_captures(self._captured_data, self._captured_log_prob_data)
 
                     # we have no valid log prob data if we didn't compute it
                     # yield new_bytes[self._hidden_count:], self._is_generated, self._new_bytes_prob, self._captured_data, self._captured_log_prob_data, token_count - last_token_count
@@ -583,9 +575,9 @@ def next(self, logits):
 
             # walk down the trie as far as possible before computing the logits
             self._trie = self._token_trie
-            self._trie.match_version += (
-                1  # this invalidates all the match caches from the previous token
-            )
+            
+            # this invalidates all the match caches from the previous token
+            self._trie.match_version += 1
             # self._trie.prob = 0.0 # need to reset when we reset the match_version
             while True:
                 next_byte_mask = self._parser.next_byte_mask()
@@ -842,7 +834,9 @@ def _cleanup_tokens(self, token_ids, token_byte_positions):
             ):
                 for i in range(1, len(token_byte_positions)):
                     token_byte_positions[i] -= 1
-            assert token_byte_positions[-1] == last_pos
+            
+            
+            assert token_byte_positions[-1] == last_pos, "Cross check last_pos"
 
         return token_ids, token_byte_positions
 
@@ -869,7 +863,7 @@ class Model:
 
     Model objects are immutable representations of model state, so whenever you change
     them you get a new Model object. However, these copies share the "expensive"
-    parts of the underlying model like the the parameters and KV-cache, through a shared
+    parts of the underlying model like the parameters and KV-cache, through a shared
     Engine, so making copies of Model objects is cheap.
 
     .. automethod:: __add__
@@ -901,6 +895,7 @@ def __init__(self, engine, echo=True, **kwargs):
         #     tokenizer = Tokenizer(tokenizer)
 
         self.engine = engine
+        self.chat_template = engine.get_chat_template() # TODO [HN]: Should this be a method or attr?
         self.echo = echo
         self.token_count = 0  # tracks how many tokens our byte state represents
         self.max_display_rate = 0.2  # this controls how frequently we are allowed to redraw the display (in seconds)
@@ -909,19 +904,13 @@ def __init__(self, engine, echo=True, **kwargs):
 
         # private attributes
         self._variables = {}  # these are the state variables stored with the model
-        self._variables_log_probs = (
-            {}
-        )  # these are the state variables stored with the model
+        self._variables_log_probs = {}  # these are the state variables stored with the model
         self._cache_state = {}  # mutable caching state used to save computation
         self._state = ""  # the current bytes that represent the state of the model
         self._event_queue = None  # TODO: these are for streaming results in code, but that needs implemented
         self._event_parent = None
-        self._last_display = (
-            0  # used to track the last display call to enable throttling
-        )
-        self._last_event_stream = (
-            0  # used to track the last event streaming call to enable throttling
-        )
+        self._last_display = 0  # used to track the last display call to enable throttling
+        self._last_event_stream = 0  # used to track the last event streaming call to enable throttling
 
     @property
     def active_role_end(self):
@@ -986,17 +975,15 @@ def copy(self):
         new_lm.opened_blocks = self.opened_blocks.copy()
 
         # create a new clean event queue
-        new_lm._event_queue = (
-            None  # we start with no event queue because nobody is listening to us yet
-        )
+        new_lm._event_queue = None  # we start with no event queue because nobody is listening to us yet
+
         if self._event_queue is not None:
-            new_lm._event_parent = (
-                self  # the current lm has an event que we make it our parent
-            )
+            # if the current lm has an event queue, we make it our parent
+            new_lm._event_parent = self
+
         elif self._event_parent is not None:
-            new_lm._event_parent = (
-                self._event_parent
-            )  # otherwise if the current event que has an event parent then that is also our parent
+            # otherwise, if the current lm has an event parent, then that is also our parent
+            new_lm._event_parent = self._event_parent  
 
         return new_lm
 
@@ -1120,9 +1107,7 @@ def __add__(self, value):
                 lm += context.opener
                 with grammar_only():
                     tmp = lm + context.closer
-                close_text = tmp._state[
-                    len(lm._state) :
-                ]  # get the new state added by calling the closer
+                close_text = tmp._state[len(lm._state):]  # get the new state added by calling the closer
                 lm.opened_blocks[context] = (len(lm._state), close_text)
 
                 # clear out names that we override
@@ -1417,9 +1402,7 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
                                 except UnicodeDecodeError:
                                     pass
 
-                                if k not in lm or not isinstance(
-                                    lm._variables[k], list
-                                ):
+                                if k not in lm or not isinstance(lm._variables[k], list):
                                     lm._variables[k] = []
                                     lm._variables_log_probs[k] = []
                                 lm._variables[k].append(inner_v)
@@ -1436,9 +1419,7 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
                             except UnicodeDecodeError:
                                 pass
                             lm._variables[k] = v
-                            lm._variables_log_probs[k] = chunk.capture_group_log_probs[
-                                k
-                            ]
+                            lm._variables_log_probs[k] = chunk.capture_group_log_probs[k]
 
             # if len(chunk.capture_groups) > 0:
             #     for k in chunk.capture_groups:
@@ -1622,9 +1603,7 @@ def _check_dominated(node, parser, match_version, next_byte_mask):
             return False  # this child does not dominate the node, so the node is not dominated
         elif child.value is None:  # this child might not dominate the node
             parser.consume_byte(next_byte, log_prob=0.0)
-            child_dominate = _check_dominated(
-                child, parser, match_version, parser.next_byte_mask()
-            )
+            child_dominate = _check_dominated(child, parser, match_version, parser.next_byte_mask())
             parser.pos = curr_pos
             if not child_dominate:
                 return False
diff --git a/guidance/models/_openai.py b/guidance/models/_openai.py
index 43a5a6e25..85293c26a 100644
--- a/guidance/models/_openai.py
+++ b/guidance/models/_openai.py
@@ -8,7 +8,7 @@
 
 
 from ._model import Chat, Instruct
-from ._grammarless import GrammarlessEngine, Grammarless
+from ._grammarless import GrammarlessEngine, Grammarless, GrammarlessTokenizer
 
 try:
     import openai
@@ -38,129 +38,39 @@ def __init__(
         self.client = client_class(**kwargs)
         self.model_name = model
 
+        # Create a simple registry of models that use completion endpoints.
+        self._completion_models = set(
+            [
+                "gpt-35-turbo-instruct",
+                "gpt-3.5-turbo-instruct",
+                "babbage-002",
+                "davinci-002",
+            ]
+        )
+
         if tokenizer is None:
             tokenizer = tiktoken.encoding_for_model(model)
 
         super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs)
 
-
-class OpenAI(Grammarless):
-    def __init__(
-        self,
-        model,
-        tokenizer=None,
-        echo=True,
-        api_key=None,
-        max_streaming_tokens=1000,
-        timeout=0.5,
-        compute_log_probs=False,
-        engine_class=None,
-        **kwargs,
-    ):
-        """Build a new OpenAI model object that represents a model in a given state.
-
-        This class automatically subclasses itself into the appropriate OpenAIChat, OpenAIInstruct,
-        or OpenAICompletion subclass based on the model name.
-
-        Parameters
-        ----------
-        model : str
-            The name of the OpenAI model to use (e.g. gpt-3.5-turbo).
-        tokenizer : None or tiktoken.Encoding
-            The tokenizer to use for the given model. If set to None we use `tiktoken.encoding_for_model(model)`.
-        echo : bool
-            If true the final result of creating this model state will be displayed (as HTML in a notebook).
-        api_key : None or str
-            The OpenAI API key to use for remote requests, passed directly to the `openai.OpenAI` constructor.
-        max_streaming_tokens : int
-            The maximum number of tokens we allow this model to generate in a single stream. Normally this is set very
-            high and we rely either on early stopping on the remote side, or on the grammar terminating causing the
-            stream loop to break on the local side. This number needs to be longer than the longest stream you want
-            to generate.
-        **kwargs :
-            All extra keyword arguments are passed directly to the `openai.OpenAI` constructor. Commonly used argument
-            names include `base_url` and `organization`
-        """
-
-        if client_class is None:
-            raise Exception(
-                "Please install the openai package version >= 1 using `pip install openai -U` in order to use guidance.models.OpenAI!"
-            )
-
-        # if we are called directly (as opposed to through super()) then we convert ourselves to a more specific subclass if possible
-        if self.__class__ is OpenAI:
-
-            # instruct
-            # elif "instruct" in model: # All current OpenAI instruct models behave as Completion models.
-            #     found_subclass = OpenAIInstruct
-
-            found_subclass: typing.Type[OpenAI] = (
-                OpenAICompletion if model.endswith("-instruct") else OpenAIChat
-            )
-
-            # convert to any found subclass
-            self.__class__ = found_subclass
-            found_subclass.__init__(
-                self,
-                model,
-                tokenizer=tokenizer,
-                echo=echo,
-                api_key=api_key,
-                max_streaming_tokens=max_streaming_tokens,
-                **kwargs,
-            )
-            return  # we return since we just ran init above and don't need to run again
-
-        # this allows us to use a single constructor for all our subclasses
-        if engine_class is None:
-            engine_map = {
-                OpenAICompletion: OpenAICompletionEngine,
-                OpenAIInstruct: OpenAIInstructEngine,
-                OpenAIChat: OpenAIChatEngine,
-            }
-            for k in engine_map:
-                if issubclass(self.__class__, k):
-                    engine_class = engine_map[k]
-                    break
-
-        super().__init__(
-            engine_class(
-                tokenizer=tokenizer,
-                api_key=api_key,
-                max_streaming_tokens=max_streaming_tokens,
-                timeout=timeout,
-                compute_log_probs=compute_log_probs,
-                model=model,
-                **kwargs,
-            ),
-            echo=echo,
-        )
-
-
-class OpenAICompletion(OpenAI):
-    pass
-
-
-class OpenAICompletionEngine(OpenAIEngine):
-    def _generator(self, prompt, temperature):
-
+    def _generator_completion(self, prompt, temperature):
+        # Only runs on legacy OpenAI models that use the old completion endpoints.
         self._reset_shared_data(prompt, temperature)  # update our shared data state
 
         try:
-            # Ideally, for the metrics we would use those returned by the
-            # OpenAI API. Unfortunately, it appears that AzureAI hosted
-            # models do not support returning metrics when streaming yet
-            prompt_string = prompt.decode("utf8")
+            prompt_decoded = prompt.decode("utf8")
             generator = self.client.completions.create(
                 model=self.model_name,
-                prompt=prompt_string,
+                prompt=prompt_decoded,
                 max_tokens=self.max_streaming_tokens,
                 n=1,
                 top_p=1.0,  # TODO: this should be controllable like temp (from the grammar)
                 temperature=temperature,
                 stream=True,
             )
-        except Exception as e:  # TODO: add retry logic
+            self.metrics.engine_input_tokens += len(self.tokenizer(prompt_decoded))
+        except Exception as e:
+            # TODO: add retry logic, but keep token counts straight
             raise e
 
         for part in generator:
@@ -168,81 +78,19 @@ def _generator(self, prompt, temperature):
                 chunk = part.choices[0].text or ""
             else:
                 chunk = ""
-            self.metrics.engine_input_tokens += len(self.tokenizer(prompt_string))
             self.metrics.engine_output_tokens += len(self.tokenizer(chunk))
             yield chunk.encode("utf8")
 
-
-class OpenAIInstruct(OpenAI, Instruct):
-    def get_role_start(self, name):
-        return ""
-
-    def get_role_end(self, name):
-        if name == "instruction":
-            return "<|endofprompt|>"
-        else:
-            raise Exception(
-                f"The OpenAIInstruct model does not know about the {name} role type!"
-            )
-
-
-class OpenAIInstructEngine(OpenAIEngine):
-    def _generator(self, prompt, temperature):
-        # start the new stream
-        eop_count = prompt.count(b"<|endofprompt|>")
-        if eop_count > 1:
-            raise Exception(
-                "This model has been given multiple instruct blocks or <|endofprompt|> tokens, but this is not allowed!"
-            )
-        updated_prompt = prompt + b"<|endofprompt|>" if eop_count == 0 else prompt
-
-        self._reset_shared_data(updated_prompt, temperature)
-
-        try:
-            generator = self.client.completions.create(
-                model=self.model_name,
-                prompt=self._shared_state["data"].decode("utf8"),
-                max_tokens=self.max_streaming_tokens,
-                n=1,
-                top_p=1.0,  # TODO: this should be controllable like temp (from the grammar)
-                temperature=temperature,
-                stream=True,
-            )
-        except Exception as e:  # TODO: add retry logic
-            raise e
-
-        for part in generator:
-            if len(part.choices) > 0:
-                chunk = part.choices[0].text or ""
-            else:
-                chunk = ""
-
-            yield chunk.encode("utf8")
-
-
-class OpenAIChat(OpenAI, Chat):
-    pass
-
-
-class OpenAIChatEngine(OpenAIEngine):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        path = os.path.join(platformdirs.user_cache_dir("guidance"), "openai.tokens")
-        self.cache = dc.Cache(path)
-
-    def _hash_prompt(self, prompt):
-        return hashlib.sha256(f"{prompt}".encode()).hexdigest()
-
-    def _generator(self, prompt, temperature):
-
+    def _generator_chat(self, prompt, temperature):
         # find the role tags
         pos = 0
-        role_end = b"<|im_end|>"
+        role_end = b"<|im_end|>\n"
         messages = []
         found = True
         input_token_count = 0
-        while found:
 
+        # TODO: refactor this to method on parent class? (or a util function)
+        while found:
             # find the role text blocks
             found = False
             for role_name, start_bytes in (
@@ -278,16 +126,6 @@ def _generator(self, prompt, temperature):
         # Update shared data state
         self._reset_shared_data(prompt[:pos], temperature)
 
-        # Use cache only when temperature is 0
-        if temperature == 0:
-            cache_key = self._hash_prompt(prompt)
-
-            # Check if the result is already in the cache
-            if cache_key in self.cache:
-                for chunk in self.cache[cache_key]:
-                    yield chunk
-                return
-
         # API call and response handling
         try:
             # Ideally, for the metrics we would use those returned by the
@@ -304,9 +142,6 @@ def _generator(self, prompt, temperature):
             )
             self.metrics.engine_input_tokens += input_token_count
 
-            if temperature == 0:
-                cached_results = []
-
             for part in generator:
                 if len(part.choices) > 0:
                     chunk = part.choices[0].delta.content or ""
@@ -316,12 +151,66 @@ def _generator(self, prompt, temperature):
                 self.metrics.engine_output_tokens += len(self.tokenizer(chunk))
                 yield encoded_chunk
 
-                if temperature == 0:
-                    cached_results.append(encoded_chunk)
+        except Exception as e:
+            # TODO: add retry logic, keeping token counts in mind
+            raise e
 
-            # Cache the results after the generator is exhausted
-            if temperature == 0:
-                self.cache[cache_key] = cached_results
+    def _generator(self, prompt, temperature):
+        if self.model_name in self._completion_models:
+            return self._generator_completion(prompt, temperature)
+        else:
+            # Otherwise we are in a chat context
+            return self._generator_chat(prompt, temperature)
 
-        except Exception as e:  # TODO: add retry logic
-            raise e
+
+class OpenAI(Grammarless):
+    def __init__(
+        self,
+        model,
+        tokenizer=None,
+        echo=True,
+        api_key=None,
+        max_streaming_tokens=1000,
+        timeout=0.5,
+        compute_log_probs=False,
+        **kwargs,
+    ):
+        """Build a new OpenAI model object that represents a model in a given state.
+
+        Parameters
+        ----------
+        model : str
+            The name of the OpenAI model to use (e.g. gpt-3.5-turbo).
+        tokenizer : None or tiktoken.Encoding
+            The tokenizer to use for the given model. If set to None we use `tiktoken.encoding_for_model(model)`.
+        echo : bool
+            If true the final result of creating this model state will be displayed (as HTML in a notebook).
+        api_key : None or str
+            The OpenAI API key to use for remote requests, passed directly to the `openai.OpenAI` constructor.
+        max_streaming_tokens : int
+            The maximum number of tokens we allow this model to generate in a single stream. Normally this is set very
+            high and we rely either on early stopping on the remote side, or on the grammar terminating causing the
+            stream loop to break on the local side. This number needs to be longer than the longest stream you want
+            to generate.
+        **kwargs :
+            All extra keyword arguments are passed directly to the `openai.OpenAI` constructor. Commonly used argument
+            names include `base_url` and `organization`
+        """
+
+        if client_class is None:
+            raise Exception(
+                "Please install the openai package version >= 1 using `pip install openai -U` in order to use guidance.models.OpenAI!"
+            )
+
+        super().__init__(
+            engine=OpenAIEngine(
+                tokenizer=tokenizer,
+                api_key=api_key,
+                max_streaming_tokens=max_streaming_tokens,
+                timeout=timeout,
+                compute_log_probs=compute_log_probs,
+                model=model,
+                **kwargs,
+            ),
+            echo=echo,
+        )
diff --git a/guidance/models/_remote.py b/guidance/models/_remote.py
index 309c6f68b..8ea7fecce 100644
--- a/guidance/models/_remote.py
+++ b/guidance/models/_remote.py
@@ -2,19 +2,32 @@
 import os
 import base64
 
-from ._model import Engine, EngineCallResponse
-
+from ._model import Tokenizer, Engine, EngineCallResponse
+from .._chat import ChatMLTemplate
+from ._grammarless import GrammarlessTokenizer
 
 class RemoteEngine(Engine):
     """This connects to a remote guidance server and runs all computation using the remote engine."""
 
-    def __init__(self, server_url, api_key, verify=None):
+    def __init__(self, server_url, api_key, tokenizer=None, verify=None):
         self.server_url = server_url
         self.api_key = api_key
         if verify is None:
             verify = os.getenv("GUIDANCE_SSL_CERTFILE", None)
         self.verify_crt = verify
 
+        # If tokenizer is not already an instance of Tokenizer, then instantiate it as a GrammarlessTokenizer
+        if not isinstance(tokenizer, Tokenizer):
+            tokenizer = GrammarlessTokenizer(tokenizer)
+
+        # This engine requires a tokenizer whose chat_template is the ChatML template
+        # TODO: Consider different enforcement of this 
+        if tokenizer.chat_template is not ChatMLTemplate:
+            raise Exception("The tokenizer provided to the engine follows a non-ChatML format in its chat_template. \
+                    Using a transformers, tiktoken, or guidance.GrammarlessTokenizer directly will solve this issue.")
+        # build the Engine
+        super().__init__(tokenizer=tokenizer, compute_log_probs=False)
+
     def __call__(self, parser, grammar, ensure_bos_token=True):
         # Prepare the request data
         data = {
@@ -38,8 +51,7 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
             response.raise_for_status()
 
         # Process and yield the response data
-        for chunk in response.iter_content(
-            chunk_size=None
-        ):  # chunk_size=None means it'll stream the content
+        # chunk_size=None means it'll stream the content
+        for chunk in response.iter_content(chunk_size=None):
             response_data = EngineCallResponse.deserialize(chunk)
             yield response_data
diff --git a/guidance/models/_togetherai.py b/guidance/models/_togetherai.py
index ee26e8c48..bcac51f85 100644
--- a/guidance/models/_togetherai.py
+++ b/guidance/models/_togetherai.py
@@ -1,10 +1,7 @@
 import os
 from ._model import Chat, Instruct
 from ._openai import (
-    OpenAIChatEngine,
     OpenAI,
-    OpenAIInstructEngine,
-    OpenAICompletionEngine,
     OpenAIEngine,
 )
 from .transformers._transformers import TransformersTokenizer
@@ -45,10 +42,10 @@ def __init__(
 
         if engine_class is None:
             engine_map = {
-                TogetherAICompletion: OpenAICompletionEngine,
-                TogetherAIInstruct: OpenAIChatEngine,
-                TogetherAIChat: OpenAIChatEngine,
-                TogetherAI: OpenAICompletionEngine,
+                TogetherAICompletion: OpenAIEngine,
+                TogetherAIInstruct: OpenAIEngine,
+                TogetherAIChat: OpenAIEngine,
+                TogetherAI: OpenAIEngine,
             }
             for k in engine_map:
                 if issubclass(self.__class__, k):
diff --git a/guidance/models/llama_cpp/__init__.py b/guidance/models/llama_cpp/__init__.py
index 3b59f762c..14b3fc34b 100644
--- a/guidance/models/llama_cpp/__init__.py
+++ b/guidance/models/llama_cpp/__init__.py
@@ -1,2 +1 @@
-from ._llama_cpp import LlamaCpp, LlamaCppChat
-from ._mistral import MistralChat, MistralInstruct
+from ._llama_cpp import LlamaCpp
diff --git a/guidance/models/llama_cpp/_llama_cpp.py b/guidance/models/llama_cpp/_llama_cpp.py
index 34231aae1..3ad421699 100644
--- a/guidance/models/llama_cpp/_llama_cpp.py
+++ b/guidance/models/llama_cpp/_llama_cpp.py
@@ -50,7 +50,7 @@ def __del__(self):
 
 
 class LlamaCppTokenizer(Tokenizer):
-    def __init__(self, model_obj):
+    def __init__(self, model_obj, chat_template=None):
         self._model_obj = model_obj
 
         tokenizer = llama_cpp.LlamaTokenizer(model_obj)
@@ -60,17 +60,18 @@ def __init__(self, model_obj):
         # get the bytes strings for all the tokens
         tokens = []
         for i in range(tokenizer.llama.n_vocab()):
-            tok = tokenizer.llama.detokenize(
-                [i]
-            )  # note that detokenize returns bytes directly
+            tok = tokenizer.llama.detokenize([i])  # note that detokenize returns bytes directly
             if tok == b"":
-                tok = llama_cpp.llama_token_get_text(
-                    model_obj.model, i
-                )  # get text rep of special tokens
+                tok = llama_cpp.llama_token_get_text(model_obj.model, i)  # get text rep of special tokens
             tokens.append(tok)
 
+        # Chat Template logic
+        if chat_template is None:
+            if hasattr(self._model_obj, "metadata") and "tokenizer.chat_template" in self._model_obj.metadata:
+                chat_template = self._model_obj.metadata["tokenizer.chat_template"]
+
         super().__init__(
-            tokens, tokenizer.llama.token_bos(), tokenizer.llama.token_eos()
+            tokens, chat_template, tokenizer.llama.token_bos(), tokenizer.llama.token_eos()
         )
 
     def __call__(self, byte_string):
@@ -80,7 +81,7 @@ def __call__(self, byte_string):
 class LlamaCppEngine(Engine):
     """The core class that runs inference using llama.cpp."""
 
-    def __init__(self, model, compute_log_probs, **kwargs):
+    def __init__(self, model, compute_log_probs, chat_template=None, **kwargs):
         if not is_llama_cpp:
             raise Exception(
                 "Please install llama-cpp-python with `pip install llama-cpp-python` in order to use guidance.models.LlamaCpp!"
@@ -133,7 +134,7 @@ def __init__(self, model, compute_log_probs, **kwargs):
         self._cache_token_ids = []
 
         super().__init__(
-            LlamaCppTokenizer(self.model_obj), compute_log_probs=compute_log_probs
+            LlamaCppTokenizer(self.model_obj, chat_template=chat_template), compute_log_probs=compute_log_probs
         )
 
         self._n_vocab = len(self.tokenizer.tokens)
@@ -217,6 +218,7 @@ def __init__(
         echo=True,
         compute_log_probs=False,
         api_key=None,
+        chat_template=None,
         **llama_cpp_kwargs,
     ):
         """Build a new LlamaCpp model object that represents a model in a given state."""
@@ -225,39 +227,7 @@ def __init__(
             engine = RemoteEngine(model, api_key=api_key, **llama_cpp_kwargs)
         else:
             engine = LlamaCppEngine(
-                model, compute_log_probs=compute_log_probs, **llama_cpp_kwargs
+                model, compute_log_probs=compute_log_probs, chat_template=chat_template, **llama_cpp_kwargs
             )
 
-        super().__init__(engine, echo=echo)
-
-
-class LlamaCppChat(LlamaCpp, Chat):
-    def get_role_start(self, role_name, **kwargs):
-        if role_name == "user":
-
-            # if we follow an auto-nested system role then we are done
-            if self._current_prompt().endswith("\n<</SYS>>\n\n"):
-                return ""
-            else:
-                return "[INST] "
-
-        elif role_name == "assistant":
-            return " "
-
-        elif role_name == "system":
-
-            # check if we are already embedded at the top of a user role
-            if self._current_prompt().endswith("[INST] "):
-                return "<<SYS>>\n"
-
-            # if not then we auto nest ourselves
-            else:
-                return "[INST] <<SYS>>\n"
-
-    def get_role_end(self, role_name=None):
-        if role_name == "user":
-            return " [/INST]"
-        elif role_name == "assistant":
-            return " "
-        elif role_name == "system":
-            return "\n<</SYS>>\n\n"
+        super().__init__(engine, echo=echo)
\ No newline at end of file
diff --git a/guidance/models/llama_cpp/_mistral.py b/guidance/models/llama_cpp/_mistral.py
deleted file mode 100644
index c7eddf182..000000000
--- a/guidance/models/llama_cpp/_mistral.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from .._model import Instruct
-from ._llama_cpp import LlamaCpp, LlamaCppChat
-
-
-class MistralChat(LlamaCppChat):
-    def get_role_start(self, role_name, **kwargs):
-        if role_name == "user":
-            return "[INST] "
-
-        elif role_name == "assistant":
-            return ""
-
-        elif role_name == "system":
-            raise Exception("MistralChat does not support a sytem role!")
-
-    def get_role_end(self, role_name=None):
-        if role_name == "user":
-            return " [/INST]"
-        elif role_name == "assistant":
-            return "</s>"
-        elif role_name == "system":
-            raise Exception("MistralChat does not support a sytem role!")
-
-
-class MistralInstruct(LlamaCpp, Instruct):
-    def get_role_start(self, role_name, **kwargs):
-        if role_name == "instruction":
-            return "[INST] "
-
-    def get_role_end(self, role_name=None):
-        if role_name == "instruction":
-            return " [/INST]"
diff --git a/guidance/models/transformers/__init__.py b/guidance/models/transformers/__init__.py
index 4d5c298c0..ada44e2b7 100644
--- a/guidance/models/transformers/__init__.py
+++ b/guidance/models/transformers/__init__.py
@@ -1,2 +1 @@
-from ._llama import Llama, LlamaChat
-from ._transformers import Transformers, TransformersChat
+from ._transformers import Transformers
diff --git a/guidance/models/transformers/_llama.py b/guidance/models/transformers/_llama.py
deleted file mode 100644
index ae0a56d7a..000000000
--- a/guidance/models/transformers/_llama.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from ._transformers import Transformers, TransformersChat
-
-
-class Llama(Transformers):
-    pass
-
-
-class LlamaChat(TransformersChat, Llama):
-
-    def system(self):
-        """Patch up the system command to convert normal system role structure into Llama structure (nested in the first user message)."""
-        self._system_prefex = "[INST] " if str(self) == "" else ""
-        out = super().system()
-        delattr(self, "_system_prefex")
-        return out
-
-    def get_role_start(self, role_name, **kwargs):
-        if role_name == "system":
-            return self._system_prefex + "<<SYS>>\n"
-        elif role_name == "user":
-            if str(self).endswith("\n<</SYS>>\n\n"):
-                return ""  # we don't need to start anything if we are starting with a top level unnested system tag
-            else:
-                return "[INST] "
-        else:
-            return " "
-
-    def get_role_end(self, role_name=None):
-        if role_name == "system":
-            return "\n<</SYS>>\n\n"
-        elif role_name == "user":
-            return " [/INST]"
-        else:
-            return " "
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 5f395db61..c6a93fdc9 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -6,11 +6,11 @@
 except ModuleNotFoundError:
     pass
 
-from .._model import Tokenizer, Engine, Model, Chat
+from .._model import Tokenizer, Engine, Model
 
 
 class TransformersTokenizer(Tokenizer):
-    def __init__(self, model, tokenizer, ignore_bos_token=False):
+    def __init__(self, model, tokenizer, chat_template=None, ignore_bos_token=False):
         if tokenizer is None:
             tokenizer = self._tokenizer(model)
 
@@ -66,6 +66,10 @@ def __init__(self, model, tokenizer, ignore_bos_token=False):
                     reconstructed += bytes(
                         [byte_decoder[c] for c in t.convert_ids_to_tokens(i)]
                     )
+                # Some tokenizers prepend their BOS token automatically when called, so strip it (when one is actually set) before comparing.
+                # bos_token may exist but be None; slice by the encoded byte length, which differs from len(str) for non-ASCII BOS tokens.
+                if getattr(t, "bos_token", None) and reconstructed.startswith(t.bos_token.encode()):
+                    reconstructed = reconstructed[len(t.bos_token.encode()) :]
             except:
                 raise ValueError(
                     f"The tokenizer being used is unable to convert a special character in {s}. For models with sentencepiece based tokenizers (e.g. llama, phi-3-mini), installing sentencepiece often fixes this issue (pip install sentencepiece)."
@@ -80,9 +84,14 @@ def __init__(self, model, tokenizer, ignore_bos_token=False):
                 )
                 byte_tokens[i] = byte_coded
 
+        # Chat Template logic
+        if chat_template is None and hasattr(self._orig_tokenizer, "chat_template"):
+            chat_template = self._orig_tokenizer.chat_template
+
         # the superclass does most of the work once we have the tokens
         super().__init__(
             byte_tokens,
+            chat_template,
             None if ignore_bos_token else tokenizer.bos_token_id,
             tokenizer.eos_token_id,
         )
@@ -128,7 +137,7 @@ def __call__(self, byte_string):
 
 
 class TransformersEngine(Engine):
-    def __init__(self, model, tokenizer, compute_log_probs, **kwargs):
+    def __init__(self, model, tokenizer, compute_log_probs, chat_template=None, **kwargs):
         # fill in default model value
         if model is None:
             model = os.environ.get("TRANSFORMERS_MODEL", None)
@@ -150,8 +159,9 @@ def __init__(self, model, tokenizer, compute_log_probs, **kwargs):
         self._cached_token_ids = []
 
         super().__init__(
-            TransformersTokenizer(model, tokenizer), compute_log_probs=compute_log_probs
+            TransformersTokenizer(model, tokenizer, chat_template), compute_log_probs=compute_log_probs
         )
+        assert self._token_trie.match
 
     def _model(self, model, **kwargs):
         # intantiate the model if needed
@@ -183,9 +193,7 @@ def _joint_tokenize(self, token_ids):
                 else:
                     used_tokens -= 1
 
-        new_ids = self.tokenizer._orig_tokenizer(
-            first_decode, add_special_tokens=False
-        )["input_ids"]
+        new_ids = self.tokenizer._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
         if used_tokens < len(token_ids):
             new_ids += token_ids[used_tokens:]
 
@@ -208,9 +216,7 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
         """
 
         # make sure we don't run off the end of the model
-        if len(token_ids) >= getattr(
-            self.model_obj.config, "max_position_embeddings", 1e10
-        ):
+        if len(token_ids) >= getattr(self.model_obj.config, "max_position_embeddings", 1e10):
             raise Exception(
                 f"Attempted to run a transformers model past its maximum context window size of {self.model_obj.config.max_position_embeddings}!"
             )
@@ -229,16 +235,11 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
 
         # reset the cache length according to that number of positions
         past_key_values = self._past_key_values
-        past_length = (
-            past_key_values[0][0].size(-2) if past_key_values is not None else 0
-        )
+        past_length = past_key_values[0][0].size(-2) if past_key_values is not None else 0
         if past_length > num_cached:
-            past_length = max(
-                0, num_cached - 1
-            )  # note we recompute the last token because we don't bother to handle the special case of just computing logits
-            self._past_key_values = tuple(
-                tuple(p[..., :past_length, :] for p in v) for v in past_key_values
-            )
+            # note we recompute the last token because we don't bother to handle the special case of just computing logits
+            past_length = max(0, num_cached - 1)  
+            self._past_key_values = tuple(tuple(p[..., :past_length, :] for p in v) for v in past_key_values)
         cache_token_ids[past_length:] = []
 
         # call the model
@@ -249,14 +250,8 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
                     input_ids=torch.tensor(new_token_ids).unsqueeze(0).to(self.device),
                     past_key_values=self._past_key_values,
                     use_cache=True,
-                    position_ids=torch.arange(
-                        past_length, past_length + len(new_token_ids)
-                    )
-                    .unsqueeze(0)
-                    .to(self.device),
-                    attention_mask=torch.ones(1, past_length + len(new_token_ids)).to(
-                        self.device
-                    ),
+                    position_ids=torch.arange(past_length, past_length + len(new_token_ids)).unsqueeze(0).to(self.device),
+                    attention_mask=torch.ones(1, past_length + len(new_token_ids)).to(self.device),
                     return_dict=True,
                     output_attentions=False,
                     output_hidden_states=False,
@@ -266,9 +261,7 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
             self._past_key_values = model_out.past_key_values
             cache_token_ids.extend(new_token_ids)
             # Need to add special truncating logic here for weird models that have a different output size than tokenizer vocab
-            self._cached_logits = (
-                model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
-            )
+            self._cached_logits = model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
             self.metrics.engine_input_tokens += len(new_token_ids)
             self.metrics.engine_output_tokens += 1
 
@@ -277,13 +270,9 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
 
 class Transformers(Model):
     def __init__(
-        self, model=None, tokenizer=None, echo=True, compute_log_probs=False, **kwargs
+        self, model=None, tokenizer=None, echo=True, compute_log_probs=False, chat_template=None, **kwargs
     ):
         """Build a new Transformers model object that represents a model in a given state."""
         super().__init__(
-            TransformersEngine(model, tokenizer, compute_log_probs, **kwargs), echo=echo
-        )
-
-
-class TransformersChat(Transformers, Chat):
-    pass
+            TransformersEngine(model, tokenizer, compute_log_probs, chat_template=chat_template, **kwargs), echo=echo
+        )
\ No newline at end of file
diff --git a/notebooks/art_of_prompt_design/rag.ipynb b/notebooks/art_of_prompt_design/rag.ipynb
index 79d3f452f..53ea97a1b 100644
--- a/notebooks/art_of_prompt_design/rag.ipynb
+++ b/notebooks/art_of_prompt_design/rag.ipynb
@@ -35,7 +35,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llama2 = models.LlamaCppChat(\"/home/marcotcr_google_com/work/models/llama-2-13b-chat.Q6_K.gguf\", n_gpu_layers=-1, n_ctx=4096)"
+    "llama2 = models.LlamaCpp(\"/home/marcotcr_google_com/work/models/llama-2-13b-chat.Q6_K.gguf\", n_gpu_layers=-1, n_ctx=4096)"
    ]
   },
   {
diff --git a/notebooks/art_of_prompt_design/use_clear_syntax.ipynb b/notebooks/art_of_prompt_design/use_clear_syntax.ipynb
index 5a83d20a9..78fa0e6c7 100644
--- a/notebooks/art_of_prompt_design/use_clear_syntax.ipynb
+++ b/notebooks/art_of_prompt_design/use_clear_syntax.ipynb
@@ -877,7 +877,7 @@
     "# if we have multple GPUs we can load the chat model on a different GPU with the `device` argument\n",
     "del lm\n",
     "time.sleep(call_delay_secs)\n",
-    "chat_lm = guidance.models.LlamaCppChat(downloaded_file, **model_kwargs)"
+    "chat_lm = guidance.models.LlamaCpp(downloaded_file, **model_kwargs)"
    ]
   },
   {
diff --git a/notebooks/tutorials/adding_new_models.ipynb b/notebooks/tutorials/adding_new_models.ipynb
index e7ba334be..d6a5f29b3 100644
--- a/notebooks/tutorials/adding_new_models.ipynb
+++ b/notebooks/tutorials/adding_new_models.ipynb
@@ -6,6 +6,9 @@
    "source": [
     "# Adding support for a new models\n",
     "\n",
+    "> **NOTE:** This notebook is now out of date and needs to be rewritten to account for ChatTemplates.\n",
+    "\n",
+    "\n",
     "Different models are tuned with different role prompt formats. If the model you are using is not already a subclass of `guidance.Model`, you can define your own new subclass with whatever role prompt format you want. Then you can use the guidance role tags and they will get translated into the correct prompt format."
    ]
   },
diff --git a/notebooks/tutorials/chat.ipynb b/notebooks/tutorials/chat.ipynb
index e0d21be06..cf6637996 100644
--- a/notebooks/tutorials/chat.ipynb
+++ b/notebooks/tutorials/chat.ipynb
@@ -72,7 +72,7 @@
    "source": [
     "from guidance import models, gen\n",
     "\n",
-    "azureai_model = models.AzureOpenAIChat(\n",
+    "azureai_model = models.AzureOpenAI(\n",
     "    model=model,\n",
     "    azure_endpoint=azure_endpoint,\n",
     "    azure_deployment=azure_deployment,\n",
diff --git a/tests/conftest.py b/tests/conftest.py
index dd9d499dc..ac57eea42 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,6 +16,12 @@
         name="transformers:microsoft/Phi-3-mini-4k-instruct",
         kwargs={"trust_remote_code": True},
     ),
+    "transformers_llama3cpu_8b": dict(
+        # Note that this model requires an appropriate
+        # HF_TOKEN environment variable
+        name="meta-llama/Meta-Llama-3-8B-Instruct",
+        kwargs={"trust_remote_code": True},
+    ),
     "hfllama_phi3cpu_mini_4k_instruct": dict(
         name="huggingface_hubllama:microsoft/Phi-3-mini-4k-instruct-gguf:Phi-3-mini-4k-instruct-q4.gguf",
         kwargs={"verbose": True, "n_ctx": 4096},
diff --git a/tests/models/common_chat_testing.py b/tests/models/common_chat_testing.py
index 2ed00863b..a2814a012 100644
--- a/tests/models/common_chat_testing.py
+++ b/tests/models/common_chat_testing.py
@@ -12,12 +12,10 @@ def smoke_chat(lm: models.Chat, has_system_role: bool = True):
 
     with assistant():
         lm += gen(max_tokens=10, name="text", temperature=0.5)
-        lm += "Pick a number: "
 
     print(str(lm))
     print(f"{lm.engine.metrics=}")
     assert len(lm["text"]) > 0
-    assert str(lm).endswith("Pick a number: <|im_end|>")
     assert lm.engine.metrics.engine_input_tokens > 2, "Expect some input tokens"
     assert lm.engine.metrics.engine_output_tokens > 0, "Expect some output tokens"
 
@@ -32,11 +30,9 @@ def longer_chat_1(lm: models.Chat, has_system_role: bool = True):
 
     with assistant():
         lm += gen(max_tokens=10, name="text")
-        lm += "Pick a number: "
 
     print(str(lm))
     assert len(lm["text"]) > 0
-    assert str(lm).endswith("Pick a number: <|im_end|>")
 
     with user():
         lm += "10. Now you pick a number between 0 and 20"
@@ -66,11 +62,9 @@ def longer_chat_2(lm: models.Chat, has_system_role: bool = True):
     # Resume the previous
     with assistant():
         lm += gen(max_tokens=10, name="text")
-        lm += "Pick a number: "
 
     print(str(lm))
     assert len(lm["text"]) > 0
-    assert str(lm).endswith("Pick a number: <|im_end|>")
 
     with user():
         lm += "10. Now you pick a number between 0 and 20"
diff --git a/tests/models/test_anthropic.py b/tests/models/test_anthropic.py
index 1025699a7..20fd5384f 100644
--- a/tests/models/test_anthropic.py
+++ b/tests/models/test_anthropic.py
@@ -9,7 +9,7 @@
 
 def test_anthropic_chat():
     try:
-        lm = guidance.models.AnthropicChat(model="claude-instant-1.2")
+        lm = guidance.models.Anthropic(model="claude-3-haiku-20240307")
     except:
         pytest.skip("Skipping Anthropic test because we can't load the model!")
     with system():
@@ -27,23 +27,24 @@ def test_anthropic_chat():
 
 def test_anthropic_select():
     try:
-        lm = guidance.models.AnthropicChat(model="claude-instant-1.2")
+        lm = guidance.models.Anthropic(model="claude-instant-1.2")
     except:
         pytest.skip("Skipping Anthropic test because we can't load the model!")
-    with user():
-        lm += "Pick a number: "
-    with assistant():
-        lm += select(
-            ["1", "11", "111", "1111", "11111", "111111", "1111111"], name="the number"
-        )
-
-    assert str(lm)[-1] in "123"
+    
+    # We can't meaningfully test or enforce select on this model
+    with pytest.raises(guidance.models._model.ConstraintException):
+        with user():
+            lm += "Write the next number in the list: 1,2,3,4,5,6,"
+        with assistant():
+            lm += select(
+                ["harsha", "scott", "marco"], name="the number"
+            )
 
 
 def test_anthropic_chat_loop():
     # tests issue #509
     try:
-        model = guidance.models.AnthropicChat(model="claude-instant-1.2")
+        model = guidance.models.Anthropic(model="claude-3-haiku-20240307")
     except:
         pytest.skip("Skipping Anthropic test because we can't load the model!")
 
@@ -57,3 +58,21 @@ def test_anthropic_chat_loop():
 
         with assistant():
             lm += gen(name="answer", max_tokens=2)
+
+# def test_direct_anthropic_api():
+#     import anthropic
+
+#     client = anthropic.Anthropic()
+
+#     with client.messages.stream(
+#         max_tokens=10,
+#         system="You are a counting robot. Do nothing but continue counting numbers in the same format the user presented.",
+#         messages=[{"role": "user", "content": "1,2,3,4,5,"}],
+#         model="claude-3-haiku-20240307",
+#     ) as stream:
+#         text_list = []
+#         for text in stream.text_stream:
+#             print(text, end="", flush=True)
+#             text_list.append(text)
+    
+#     assert len(text_list) > 0
\ No newline at end of file
diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py
index 0451bd773..0d2993e4b 100644
--- a/tests/models/test_azureai_openai.py
+++ b/tests/models/test_azureai_openai.py
@@ -23,7 +23,7 @@ def test_azureai_openai_chat_smoke(rate_limiter):
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
     )
-    assert isinstance(lm, models.AzureOpenAIChat)
+    assert isinstance(lm, models.AzureOpenAI)
 
     common_chat_testing.smoke_chat(lm)
 
@@ -36,7 +36,7 @@ def test_azureai_openai_chat_longer_1(rate_limiter):
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
     )
-    assert isinstance(lm, models.AzureOpenAIChat)
+    assert isinstance(lm, models.AzureOpenAI)
 
     common_chat_testing.longer_chat_1(lm)
 
@@ -49,7 +49,7 @@ def test_azureai_openai_chat_longer_2(rate_limiter):
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
     )
-    assert isinstance(lm, models.AzureOpenAIChat)
+    assert isinstance(lm, models.AzureOpenAI)
 
     common_chat_testing.longer_chat_2(lm)
 
@@ -81,11 +81,14 @@ def test_azureai_openai_completion_smoke(rate_limiter):
     azureai_key = env_or_fail("AZUREAI_COMPLETION_KEY")
     model = env_or_fail("AZUREAI_COMPLETION_MODEL")
 
+    print(f"endpoint: {' '.join(azureai_endpoint)}")
+    print(f"model: {' '.join(model)}")
+
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
     )
-    assert isinstance(lm, models.AzureOpenAICompletion)
-    assert isinstance(lm.engine, models._openai.OpenAICompletionEngine)
+    assert isinstance(lm, models.AzureOpenAI)
+    assert isinstance(lm.engine, models._openai.OpenAIEngine)
 
     result = lm + "What is 2+2?" + gen(max_tokens=10, name="text")
     print(f"result: {result['text']}")
@@ -112,8 +115,8 @@ def test_azureai_openai_completion_alt_args(rate_limiter):
         api_key=azureai_key,
         azure_deployment=azureai_deployment,
     )
-    assert isinstance(lm, models.AzureOpenAICompletion)
-    assert isinstance(lm.engine, models._openai.OpenAICompletionEngine)
+    assert isinstance(lm, models.AzureOpenAI)
+    assert isinstance(lm.engine, models._openai.OpenAIEngine)
 
     result = lm + "What is 2+2?" + gen(max_tokens=10, name="text")
     print(f"result: {result['text']}")
@@ -130,7 +133,7 @@ def test_azureai_openai_chat_loop(rate_limiter):
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
     )
-    assert isinstance(lm, models.AzureOpenAIChat)
+    assert isinstance(lm, models.AzureOpenAI)
 
     for i in range(2):
         print(f"Iteration: {i}")
diff --git a/tests/models/test_chat_templates.py b/tests/models/test_chat_templates.py
new file mode 100644
index 000000000..809a16b0c
--- /dev/null
+++ b/tests/models/test_chat_templates.py
@@ -0,0 +1,39 @@
+import pytest
+
+from guidance._chat import load_template_class, CHAT_TEMPLATE_CACHE
+import transformers
+
+from ..utils import env_or_fail
+
+@pytest.mark.needs_credentials
+@pytest.mark.parametrize(
+    "model_info",
+    [
+        ("microsoft/Phi-3-mini-4k-instruct", True), # Phi-3
+        ("meta-llama/Meta-Llama-3-8B-Instruct", True), # Llama-3
+        ("meta-llama/Llama-2-7b-chat-hf", True), # Llama-2
+        ("mistralai/Mistral-7B-Instruct-v0.2", True), # Mistral-7B-Instruct-v0.2
+        ("HuggingFaceH4/zephyr-7b-beta", False) # Have a test for model not in cache
+    ],
+)
+def test_popular_models_in_cache(model_info):
+    # This test simply checks to make sure the chat_templates haven't changed, and that they're still in our cache.
+    # If this fails, the models have had their templates updated, and we need to fix the cache manually.
+    hf_token = env_or_fail("HF_TOKEN")
+
+    model_id, should_pass = model_info
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_token)
+    model_chat_template = tokenizer.chat_template
+    if should_pass:
+        assert model_chat_template in CHAT_TEMPLATE_CACHE
+    else:
+        # TODO: Expand this test to verify that a warning gets thrown when a model isn't in the cache and we have to default to chatML syntax
+        assert model_chat_template not in CHAT_TEMPLATE_CACHE
+
+    
+
+# TODO: Expand testing to verify that tokenizer.apply_chat_template() produces the same results as our ChatTemplate subclasses.
+# This becomes possible once the new ChatTemplate is hooked up to guidance.models.Transformers and guidance.models.LlamaCpp.
+
+
diff --git a/tests/models/test_openai.py b/tests/models/test_openai.py
index adc031a3b..141a22290 100644
--- a/tests/models/test_openai.py
+++ b/tests/models/test_openai.py
@@ -6,32 +6,32 @@
 import guidance
 from guidance import assistant, gen, select, system, user
 
-
-def test_openai_class_detection():
-    # TODO: expand this with other variants of openAI models
-    test_models = {
-        "gpt-3.5-turbo": guidance.models.OpenAIChat,
-        "gpt-4": guidance.models.OpenAIChat,
-        "gpt-4-vision-preview": guidance.models.OpenAIChat,
-        "ft:gpt-3.5-turbo": guidance.models.OpenAIChat,
-        "ft:gpt-4": guidance.models.OpenAIChat,
-        "ft:gpt-4-vision-preview": guidance.models.OpenAIChat,
-        "ft:gpt-3.5-turbo:my-org:custom_suffix:id": guidance.models.OpenAIChat,
-        "gpt-3.5-turbo-instruct": guidance.models.OpenAICompletion,
-        "ft:gpt-3.5-turbo-instruct": guidance.models.OpenAICompletion,
-        "text-curie-001": guidance.models.OpenAICompletion,
-        "ft:text-curie-001": guidance.models.OpenAICompletion,
-        "text-davinci-003": guidance.models.OpenAICompletion,
-    }
-
-    for model_name, model_class in test_models.items():
-        # setting random tokenizer and fake API key to allow this test to run without tiktoken detection errors
-        initialized_model = guidance.models.OpenAI(
-            model_name,
-            tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo"),
-            api_key="blah",
-        )
-        assert isinstance(initialized_model, model_class)
+# This is all redundant with the class unification
+# def test_openai_class_detection():
+#     # TODO: expand this with other variants of openAI models
+#     test_models = {
+#         "gpt-3.5-turbo": guidance.models.OpenAIChat,
+#         "gpt-4": guidance.models.OpenAIChat,
+#         "gpt-4-vision-preview": guidance.models.OpenAIChat,
+#         "ft:gpt-3.5-turbo": guidance.models.OpenAIChat,
+#         "ft:gpt-4": guidance.models.OpenAIChat,
+#         "ft:gpt-4-vision-preview": guidance.models.OpenAIChat,
+#         "ft:gpt-3.5-turbo:my-org:custom_suffix:id": guidance.models.OpenAIChat,
+#         "gpt-3.5-turbo-instruct": guidance.models.OpenAICompletion,
+#         "ft:gpt-3.5-turbo-instruct": guidance.models.OpenAICompletion,
+#         "text-curie-001": guidance.models.OpenAICompletion,
+#         "ft:text-curie-001": guidance.models.OpenAICompletion,
+#         "text-davinci-003": guidance.models.OpenAICompletion,
+#     }
+
+#     for model_name, model_class in test_models.items():
+#         # setting random tokenizer and fake API key to allow this test to run without tiktoken detection errors
+#         initialized_model = guidance.models.OpenAI(
+#             model_name,
+#             tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo"),
+#             api_key="blah",
+#         )
+#         assert isinstance(initialized_model, model_class)
 
 
 def test_openai_basic():
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 3a73e3127..913be4d07 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -5,6 +5,25 @@
 from ..utils import get_model
 
 
+@pytest.fixture(scope="module")
+def phi3_model(selected_model, selected_model_name):
+    if selected_model_name in ["transformers_phi3cpu_mini_4k_instruct"]:
+        return selected_model
+    else:
+        pytest.skip("Requires Phi3 model")
+
+
+@pytest.fixture(scope="module")
+def llama3_model(selected_model, selected_model_name):
+    if (
+        selected_model_name in ["transformers_llama3cpu_8b"]
+        and selected_model is not None
+    ):
+        return selected_model
+    else:
+        pytest.skip("Requires Llama3 model (needs HF_TOKEN to be set)")
+
+
 def test_gpt2():
     gpt2 = get_model("transformers:gpt2")
     lm = gpt2 + "this is a test" + gen("test", max_tokens=10)
@@ -58,27 +77,79 @@ def test_transformer_smoke_select(model_name, model_kwargs):
 
 
 @pytest.mark.skip("Don't overload the build machines")
-def test_phi3_loading():
+def test_phi3_transformers_orig():
+    import torch
+    from transformers import AutoModelForCausalLM, pipeline, AutoTokenizer
+
+    torch.random.manual_seed(0)
+    model = AutoModelForCausalLM.from_pretrained(
+        "microsoft/Phi-3-mini-4k-instruct",
+        device_map="mps",
+        trust_remote_code=True,
+    )
 
-    lm = models.Transformers(
-        r"microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
+    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
     )
-    lm += f"""Finish counting to 5: 1,2,3,4, + {gen("five", max_tokens=1)}"""
-    assert lm["five"] == "5"
 
+    generation_args = {
+        "max_new_tokens": 5,
+        "return_full_text": True,
+        "temperature": 0.0,
+        "do_sample": False,
+    }
+
+    input_text = "You are a counting bot. Just keep counting numbers. 1,2,3,4"
+    output = pipe(input_text, **generation_args)
+    assert "5" in (output[0]["generated_text"])
+
+
+def test_phi3_loading(phi3_model: models.Model):
+    lm = phi3_model
+    lm += f"""You are a counting bot. Just keep counting numbers. 1,2,3,4, <|assistant|>"""
+    lm += gen("five", max_tokens=10)
+    assert "5" in lm["five"]
 
-@pytest.mark.skip("Don't overload the build machines")
-def test_phi3_chat():
-    # TODO: Double check chat format: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
 
-    lm = models.TransformersChat(
-        r"microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
+@pytest.mark.needs_credentials
+@pytest.mark.skip("Need to figure out auth")
+def test_llama3_chat():
+    lm = models.Transformers(
+        r"meta-llama/Meta-Llama-3-8B-Instruct", trust_remote_code=True
     )
     with system():
         lm += "You are a counting bot. Just keep counting numbers."
     with user():
         lm += "1,2,3,4"
     with assistant():
-        lm += gen(name="five", max_tokens=1)
+        lm += gen(name="five", max_tokens=10)
+
+    assert "5" in lm["five"]
+
+
+def test_phi3_failure_minimal(phi3_model: models.Model):
+    lm = phi3_model
+    # NOTE: This SHOULD NOT raise an exception, but guidance currently has a bug where
+    # directly passing in newlines next to special tokens for a tokenizer that does rstrip on those tokens
+    # (like phi-3) will cause a tokenization mismatch issue.
+    # We're leaving this test in so that we can reliably reproduce and debug this in the future.
+    with pytest.raises(AssertionError) as ae:
+        lm += f"""numbers.<|user|>\n1,2,3,4<|end|>\n<|assistant|>\n"""
+        lm += gen("five", max_tokens=10)
+    print(f"{ae.value.args=}")
+    assert ae.value.args[0] == "Cross check last_pos"
+
+
+def test_phi3_chat_fixed(phi3_model: models.Model):
+    lm = phi3_model
+
+    lm += "You are a counting bot. Just keep counting numbers."
+    with user():
+        lm += "1,2,3,4"
+    with assistant():
+        lm += gen(name="five", max_tokens=10)
 
-    assert lm["five"] == "5"
+    assert "5" in lm["five"]