Merge branch 'main' into integrates/iree

nod-ai · Jan 30, 2025 · 295530c · 295530c
2 parents 195e2d3 + 868bcbd
commit 295530c
Show file tree

Hide file tree

Showing 26 changed files with 384 additions and 169 deletions.
diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml
@@ -26,7 +26,7 @@ jobs:
       matrix:
         version: [3.11]
       fail-fast: false
-    runs-on: mi300x-3
+    runs-on: azure-cpubuilder-linux-scale
     # runs-on: ubuntu-latest # everything else works but this throws an "out of resources" during model loading
     # TODO: make a copy of this that runs on standard runners with tiny llama instead of a 8b model
     defaults:

diff --git a/app_tests/integration_tests/llm/server_management.py b/app_tests/integration_tests/llm/server_management.py
@@ -1,12 +1,10 @@
 """Handles server lifecycle and configuration."""
-import json
 import socket
 from contextlib import closing
 from dataclasses import dataclass
 import subprocess
 import time
 import requests
-from pathlib import Path
 import sys
 from typing import Optional
 
@@ -51,7 +49,6 @@ def __init__(self, config: ServerConfig):
         self.config = config
         self.process: Optional[subprocess.Popen] = None
         self.port: Optional[int] = None
-        self.config_path: Optional[Path] = None
 
     @staticmethod
     def find_available_port() -> int:
@@ -61,42 +58,23 @@ def find_available_port() -> int:
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
             return s.getsockname()[1]
 
-    def _write_config(self) -> Path:
-        """Creates server config by extending the exported model config."""
-        # TODO: eliminate this by moving prefix sharing algorithm to be a cmdline arg of server.py
-        source_config_path = self.config.artifacts.config_path
-        server_config_path = (
-            source_config_path.parent
-            / f"server_config_{self.config.prefix_sharing_algorithm}.json"
-        )
-
-        # Read the exported config as base
-        with open(source_config_path) as f:
-            config = json.load(f)
-        config["paged_kv_cache"][
-            "prefix_sharing_algorithm"
-        ] = self.config.prefix_sharing_algorithm
-        with open(server_config_path, "w") as f:
-            json.dump(config, f)
-        return server_config_path
-
     def start(self) -> None:
         """Starts the server process."""
         if self.process is not None:
             raise RuntimeError("Server is already running")
 
-        self.config_path = self._write_config()
         self.port = self.find_available_port()
 
         cmd = [
             sys.executable,
             "-m",
             "shortfin_apps.llm.server",
             f"--tokenizer_json={self.config.artifacts.tokenizer_path}",
-            f"--model_config={self.config_path}",
+            f"--model_config={self.config.artifacts.config_path}",
             f"--vmfb={self.config.artifacts.vmfb_path}",
             f"--parameters={self.config.artifacts.weights_path}",
             f"--port={self.port}",
+            f"--prefix_sharing_algorithm={self.config.prefix_sharing_algorithm}",
         ]
         cmd.extend(self.config.device_settings.server_flags)
 

diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -115,7 +115,7 @@ def generate_params_json(
             "paged_kv_cache": {
                 "attention_head_count_kv": hp.attention_head_count_kv,
                 "block_seq_stride": llama_config.block_seq_stride,
-                "device_block_count": 256,  # so that this makes its way into the config file & can be edited.
+                "device_block_count": args.device_block_count,  # so that this makes its way into the config file & can be edited.
             },
         }
 

diff --git a/sharktank/sharktank/examples/paged_llm_v1.py b/sharktank/sharktank/examples/paged_llm_v1.py
@@ -6,15 +6,11 @@
 
 """Inference support for the PagedLLMV1 protocol of models."""
 
-from typing import Optional
-
-from safetensors import safe_open
 
 import math
-import sys
-
+from ..models.llama.tools.data_utils import write_ndarray_to_bin
 import torch
-
+import numpy as np
 from ..layers import *
 from ..types import *
 
@@ -39,15 +35,14 @@ def __init__(
         page_cache_size: int = 128,
         # Need to look at the model more for this.
         end_token: int = 2,
+        dump_bins: bool = False,
     ):
         self.model = model
         self.tokenizer = tokenizer
-        if self.model.config.kv_cache_type == "paged":
-            self.shared_cache_state = model.cache.allocate(page_cache_size)
-            self.free_pages = list(range(1, page_cache_size))
-        else:
-            self.shared_cache_state = None
+        self.shared_cache_state = model.cache.allocate(page_cache_size)
+        self.free_pages = list(range(1, page_cache_size))
         self.end_token = end_token
+        self.dump_bins = dump_bins
 
     @property
     def block_seq_stride(self) -> int:
@@ -64,13 +59,9 @@ def begin_batch(self, prompts: list[str]):
             cache_state = self.shared_cache_state
         else:
             cache_state = self.model.cache.allocate(bs=len(prompts))
-        return Batch(self, token_ids, seq_lens, cache_state)
+        return Batch(self, token_ids, seq_lens, cache_state, dump_bins=self.dump_bins)
 
     def alloc_page(self) -> int:
-        if self.model.config.kv_cache_type == "direct":
-            # We don't allocate block ids for the direct cache.
-            return 0
-
         return self.free_pages.pop()
 
     def release_page(self, index: int):
@@ -86,6 +77,7 @@ def __init__(
         token_ids: torch.Tensor,
         seq_lens: torch.Tensor,
         cache_state: list[torch.Tensor],
+        dump_bins: bool = False,
     ):
         self.bs = token_ids.shape[0]
         assert seq_lens.shape[0] == self.bs
@@ -95,6 +87,7 @@ def __init__(
         self.cache_state = cache_state
         self.results: list[list[int]] = [[] for _ in range(self.bs)]
         self.done_result_indices: set[int] = set()
+        self.dump_bins = dump_bins
 
         # Assemble the batch.
         seq_stride = self.parent.block_seq_stride
@@ -160,6 +153,23 @@ def prefill(self):
             attention_mask = replicate(attention_mask, tp)
             seq_block_ids_tensor = replicate(seq_block_ids_tensor, tp)
 
+        if self.dump_bins:
+            write_ndarray_to_bin(
+                token_ids.numpy(),
+                f"prefill_token_ids_{'x'.join([str(x) for x in token_ids.shape])}xi64.bin",
+            )
+            write_ndarray_to_bin(
+                np.array(token_ids.shape[0], dtype=np.int64),
+                f"prefill_seq_lens_1xi64.bin",
+            )
+            write_ndarray_to_bin(
+                seq_block_ids_tensor.numpy(),
+                f"prefill_seq_block_ids_{'x'.join([str(x) for x in seq_block_ids_tensor.shape])}xi64.bin",
+            )
+            write_ndarray_to_bin(
+                self.cache_state[0].to(torch.float8_e4m3fnuz).to(torch.uint8).numpy(),
+                f"prefill_cache_state_{'x'.join([str(x) for x in self.cache_state[0].shape])}xf8E4M3FNUZ.bin",
+            )
         logits = model.prefill(
             token_ids,
             attention_mask=attention_mask,
@@ -204,6 +214,27 @@ def decode(self):
             seq_block_ids_tensor = replicate(seq_block_ids_tensor, tp)
             decode_attention_mask = replicate(decode_attention_mask, tp)
 
+        if self.dump_bins:
+            write_ndarray_to_bin(
+                self.next_tokens.numpy(),
+                f"decode_next_tokens_{'x'.join([str(x)for x in self.next_tokens.shape])}xi64.bin",
+            )
+            write_ndarray_to_bin(
+                start_positions.numpy(),
+                f"decode_start_positions_{'x'.join([str(x)for x in start_positions.shape])}xi64.bin",
+            )
+            write_ndarray_to_bin(
+                seq_block_ids_tensor.numpy(),
+                f"decode_seq_block_ids_tensor_{'x'.join([str(x)for x in seq_block_ids_tensor.shape])}xi64.bin",
+            )
+            write_ndarray_to_bin(
+                torch.tensor(self.next_tokens.shape[0]).to(torch.int64).numpy(),
+                f"decode_seq_lens_1xi64.bin",
+            )
+            write_ndarray_to_bin(
+                self.cache_state[0].to(torch.float8_e4m3fnuz).to(torch.uint8).numpy(),
+                f"decode_cache_state_{'x'.join([str(x) for x in self.cache_state[0].shape])}xf8E4M3FNUZ.bin",
+            )
         logits = model.decode(
             self.next_tokens,
             attention_mask=decode_attention_mask,
@@ -238,6 +269,11 @@ def main():
         "--save_intermediates_path",
         help="save module forward outputs to safetensors, ex: run_0 will save to run_0_prefill.savetensors",
     )
+    parser.add_argument(
+        "--dump-bins",
+        help="dump input tensors to bin files",
+        action="store_true",
+    )
     cli.add_input_dataset_options(parser)
     cli.add_tokenizer_options(parser)
     cli.add_quantization_options(parser)
@@ -274,7 +310,7 @@ def main():
 
         intermediates_saver = SaveModuleResultTensorsPatch()
         intermediates_saver.patch_child_modules(model)
-    generator = TorchGenerator(model, tokenizer)
+    generator = TorchGenerator(model, tokenizer, dump_bins=args.dump_bins)
 
     print(f":: Prompting:")
     for prompt in prompts:

diff --git a/sharktank/sharktank/kernels/bitcast.py b/sharktank/sharktank/kernels/bitcast.py
@@ -31,6 +31,7 @@
 ]
 
 _ftype_to_ctype_table = {
+    torch.bfloat16: torch.complex32,
     torch.float16: torch.complex32,
     torch.float32: torch.complex64,
 }

diff --git a/sharktank/sharktank/layers/configs/llm_configs.py b/sharktank/sharktank/layers/configs/llm_configs.py
@@ -49,7 +49,7 @@ def from_gguf_props(p: dict[str, Any]):
         name_prefix = p.get("general.architecture", "llama")
         default_expert_count = 0
         default_expert_used_count = 0
-        default_rope_freq_base = 10000.0
+        default_rope_freq_base = 500000.0
         default_rope_dimension_count = 128
         attention_head_count = _int_prop(p, f"{name_prefix}.attention.head_count")
         rope_dimension_count = _optional_int_prop(

diff --git a/sharktank/sharktank/layers/ffn_block.py b/sharktank/sharktank/layers/ffn_block.py
@@ -25,15 +25,20 @@ def __init__(
         theta: Theta,
         is_gated: bool = True,
         activation_fn: Callable[[AnyTensor], AnyTensor] = F.silu,
+        fake_quant: bool = False,
     ):
         super().__init__(theta)
 
         self.is_gated = is_gated
         self.activation_fn = activation_fn
         if self.is_gated:
-            self.add_module("ffn_gate", LinearLayer(theta("ffn_gate")))
-        self.add_module("ffn_up", LinearLayer(theta("ffn_up")))
-        self.add_module("ffn_down", LinearLayer(theta("ffn_down")))
+            self.add_module(
+                "ffn_gate", LinearLayer(theta("ffn_gate"), fake_quant=fake_quant)
+            )
+        self.add_module("ffn_up", LinearLayer(theta("ffn_up"), fake_quant=fake_quant))
+        self.add_module(
+            "ffn_down", LinearLayer(theta("ffn_down"), fake_quant=fake_quant)
+        )
 
     def forward(
         self,

diff --git a/sharktank/sharktank/layers/linear.py b/sharktank/sharktank/layers/linear.py
@@ -78,7 +78,6 @@ def forward(self, x):
             x = qdq_input.quantize(x).unpack().dequant()
 
         y = ops.linear(x, weight, bias)
-
         # Unconditionally dequantize.
         if isinstance(y, QuantizedTensor):
             y = y.unpack().dequant()
@@ -88,7 +87,7 @@ def forward(self, x):
         # level to do this, but for now its here.
         if not isinstance(y, QuantizedTensor):
             if y.dtype == torch.float8_e4m3fnuz:
-                y = ops.to(y, torch.float16)
+                y = ops.to(y, torch.bfloat16)
                 return y
         if qdq_output is not None:
             y = qdq_output.quantize(y).unpack().dequant()

diff --git a/sharktank/sharktank/layers/norm.py b/sharktank/sharktank/layers/norm.py
@@ -34,7 +34,7 @@ def __init__(
     def forward(self, x: torch.Tensor):
         orig_dtype = x.dtype
         x = ops.to(x, self.dtype)
-        norm = ops.rms_norm(x, self.weight, epsilon=self.epsilon)
+        norm = ops.rms_norm(x, self.weight, epsilon=self.epsilon, orig_dtype=orig_dtype)
         # Will automatically upcast to the dtype of the weight, which is
         # often in higher precision. Downcast back to expected.
         norm = ops.to(norm, orig_dtype)

diff --git a/sharktank/sharktank/layers/paged_llama_attention_block.py b/sharktank/sharktank/layers/paged_llama_attention_block.py
@@ -100,9 +100,7 @@ def forward(
         cache_state: list[torch.Tensor] = None,
     ):
         assert bool(start_index is not None) ^ bool(embedding_batch_mask is not None)
-
         x = self.attn_norm(h)
-
         bs, batch_seq_len, feature_dim = x.shape
         assert feature_dim == self.head_count * self.head_dim
 
@@ -128,22 +126,7 @@ def forward(
 
         # Used by fp8_e4m3fnuz model
         if self.cache_quantizer is not None:
-            # For fake quant, store the fp16 qdq value in the cache
-            if self.fake_quant:
-                xk = (
-                    self.cache_quantizer.quantize(xk)
-                    .unpack()
-                    .dequant()
-                    .to(torch.float16)
-                )
-                xv = (
-                    self.cache_quantizer.quantize(xv)
-                    .unpack()
-                    .dequant()
-                    .to(torch.float16)
-                )
-            # For real quant, store the quantized fp8 value in the cache
-            else:
+            if not self.fake_quant:
                 # TODO: this seems like a bastardization of our quantized tensor api
                 # Probably want to add support for using quantized tensors more directly
                 xk = self.cache_quantizer.quantize(xk).unpack().qs
@@ -175,11 +158,14 @@ def repeat_kv(x: torch.Tensor) -> torch.Tensor:
         # Fake quant is already dequantized when stored in the cache.
         if self.cache_quantizer and not self.fake_quant:
             xk = self.cache_quantizer.dequantize_raw_tensor(
-                xk, torch.float16, name="xk_deq"
+                xk, torch.bfloat16, name="xk_deq"
             )
             xv = self.cache_quantizer.dequantize_raw_tensor(
-                xv, torch.float16, name="xv_deq"
+                xv, torch.bfloat16, name="xv_deq"
             )
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(torch.bfloat16)
+
         # Transpose into [bs, heads, sl, dim]
         xq = xq.transpose(1, 2)
         keys = xk.transpose(1, 2)
@@ -223,7 +209,6 @@ def repeat_kv(x: torch.Tensor) -> torch.Tensor:
 
         attn_output = attn_output.transpose(1, 2)
         attn_output = attn_output.flatten(2, 3)
-
         # Project.
         attn_output = self.attn_output(attn_output)
         attn_output = self.attn_output_norm(attn_output)

diff --git a/sharktank/sharktank/models/llama/llama.py b/sharktank/sharktank/models/llama/llama.py
@@ -18,6 +18,10 @@
 from ...utils.create_cache import *
 from ... import ops
 
+
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
+
 __all__ = [
     "PagedLlamaModelV1",
 ]
@@ -82,7 +86,7 @@ def __init__(self, theta: Theta, config: LlamaModelConfig):
 
         self.add_module(
             "token_embedding",
-            TokenEmbeddingLayer(theta("token_embd"), dtype=config.activation_dtype),
+            TokenEmbeddingLayer(theta("token_embd"), dtype=self.activation_dtype),
         )
         self.add_module(
             "attention_embedding",
@@ -93,6 +97,7 @@ def __init__(self, theta: Theta, config: LlamaModelConfig):
                 device=self.device,
                 use_hf=self.use_hf,
                 tensor_parallelism_size=config.tensor_parallelism_size,
+                dtype=config.activation_dtype,
             ),
         )
         self.add_module(
@@ -258,6 +263,7 @@ def __init__(
             "ffn",
             FFN(
                 theta=theta,
+                fake_quant=fake_quant,
             ),
         )
         self.add_module(