diff --git a/Cargo.lock b/Cargo.lock index 493df0d3..e941d329 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -883,6 +883,7 @@ version = "0.2.0-dev" dependencies = [ "anyhow", "ggml-sys", + "memmap2", "rand", "thiserror", ] diff --git a/Cargo.toml b/Cargo.toml index 0309bcb3..23bcf02c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0" } spinoff = { version = "0.7.0", default-features = false, features = ["dots2"] } clap = { version = "4.1.8", features = ["derive"] } +memmap2 = "0.5.10" # Config for 'cargo dist' [workspace.metadata.dist] diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 2b5ad199..39a14fb4 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -269,6 +269,10 @@ pub struct Generate { /// option will override this if specified. #[arg(long, default_value_t = false)] pub ignore_eos: bool, + + /// Whether to use GPU acceleration when available + #[arg(long, default_value_t = false)] + pub use_gpu: bool, } impl Generate { #[cfg(all(target_os = "macos", target_arch = "aarch64"))] @@ -301,6 +305,7 @@ impl Generate { InferenceSessionConfig { memory_k_type: mem_typ, memory_v_type: mem_typ, + use_gpu: self.use_gpu, } } @@ -403,11 +408,12 @@ pub struct ModelLoad { pub lora_paths: Option>, } impl ModelLoad { - pub fn load(&self) -> Result> { + pub fn load(&self, use_gpu: bool) -> Result> { let params = ModelParameters { prefer_mmap: !self.no_mmap, context_size: self.num_ctx_tokens, lora_adapters: self.lora_paths.clone(), + use_gpu, }; let mut sp = Some(spinoff::Spinner::new( diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 5c4a4a7a..689b75c7 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -51,7 +51,7 @@ fn handle_args(args: &cli_args::BaseArgs) -> Resul fn infer(args: &cli_args::Infer) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::()?; + let model = args.model_load.load::(args.generate.use_gpu)?; let (mut session, session_loaded) = snapshot::read_or_create_session( model.as_ref(), @@ -119,7 +119,7 @@ fn infer(args: &cli_args::Infer) -> Result<()> { fn perplexity(args: &cli_args::Perplexity) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::()?; + let model = args.model_load.load::(args.generate.use_gpu)?; let (mut session, _) = snapshot::read_or_create_session( model.as_ref(), None, @@ -184,7 +184,7 @@ fn info(args: &cli_args::Info) -> Result<()> { fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); - let model = args.model_load.load::()?; + let model = args.model_load.load::(false)?; let toks = match model.vocabulary().tokenize(&prompt, false) { Ok(toks) => toks, Err(e) => { @@ -231,8 +231,8 @@ fn interactive( ) -> Result<()> { let prompt_file = args.prompt_file.contents(); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::()?; - let (mut session, session_loaded) = snapshot::read_or_create_session( + let model = args.model_load.load::(args.generate.use_gpu)?; + let (mut session, mut session_loaded) 
= snapshot::read_or_create_session( model.as_ref(), None, args.generate.load_session.as_deref(), @@ -250,11 +250,6 @@ fn interactive( let readline = rl.readline(">> "); match readline { Ok(raw_line) => { - let session_backup = if chat_mode { - None - } else { - Some(session.clone()) - }; let line = raw_line.replace("\\\n", "\n"); let prompt = prompt_file @@ -302,8 +297,14 @@ fn interactive( log::error!("Reply exceeds context window length"); } - if let Some(session_backup) = session_backup { - session = session_backup; + // Reload session in REPL mode + if !chat_mode { + (session, session_loaded) = snapshot::read_or_create_session( + model.as_ref(), + None, + args.generate.load_session.as_deref(), + inference_session_config, + ); } } Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => { diff --git a/crates/ggml/Cargo.toml b/crates/ggml/Cargo.toml index dea703c3..2c4f43bd 100644 --- a/crates/ggml/Cargo.toml +++ b/crates/ggml/Cargo.toml @@ -9,6 +9,7 @@ license = "MIT" [dependencies] thiserror = { workspace = true } ggml-sys = { path = "sys", version = "0.2.0-dev" } +memmap2 = { workspace = true } [dev-dependencies] rand = { workspace = true } diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 8c7a4900..9dd7d108 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -1,8 +1,6 @@ -use std::{ - os::raw::{c_int, c_void}, - ptr::NonNull, - sync::Arc, -}; +use std::{os::raw::c_int, ptr::NonNull, sync::Arc}; + +use memmap2::Mmap; use crate::{sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; @@ -13,23 +11,65 @@ pub struct Context { /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] /// contains a `Weak` reference underneath and doesn't let you do anything /// with it if the underlying context has been deallocated. - ptr: Arc>, + pub ptr: Arc>, + + /// Memory mapping information + pub mmap: Option, + + /// Backing buffer (in case we own it) + pub buffer: Option, } impl Context { + /// Creates a new [Context] using the buffer provided as memory + pub fn init_buffer(buffer: Buffer) -> Self { + let raw = unsafe { + sys::ggml_init(sys::ggml_init_params { + mem_size: buffer.size(), + mem_buffer: buffer.data, + no_alloc: false, + }) + }; + + Self { + ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + mmap: None, + buffer: Some(buffer), + } + } + + /// Creates a new [Context] with the memory mapped file provided + pub fn init_mmap(mmap: Mmap) -> Self { + let raw = unsafe { + sys::ggml_init(sys::ggml_init_params { + mem_size: mmap.len(), + mem_buffer: std::ptr::null_mut(), + no_alloc: true, // We are mmapping so ggml does not need to allocate any memory for us + }) + }; + + Self { + ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + mmap: Some(mmap), + buffer: None, + } + } + /// Creates a new [Context] with the specified `mem_size` as a working area. pub fn init(mem_size: usize, alloc: bool) -> Self { let raw = unsafe { sys::ggml_init(sys::ggml_init_params { mem_size, - // Null here means we want ggml to own this memory. We don't - // support passing an owned buffer from the Rust side. + // Null here means we want ggml to own this memory. mem_buffer: std::ptr::null_mut(), no_alloc: !alloc, }) }; + Self { ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + mmap: None, + buffer: None, } } @@ -391,7 +431,7 @@ impl Context { /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. 
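
> Reviewer note: a minimal sketch of how the two `Context` constructors added above (`init_mmap` and `init_buffer`) are expected to be used. The `open_context` helper and its arguments are hypothetical and for illustration only; the mmap branch mirrors the loader change later in this patch, and the buffer branch matches how `InferenceSession` now backs `ctx0`.

```rust
use std::{fs::File, io, path::Path};

use ggml::{Buffer, Context};
use memmap2::Mmap;

/// Hypothetical helper, for illustration only.
fn open_context(path: &Path, use_mmap: bool, ctx_size: usize) -> io::Result<Context> {
    if use_mmap {
        // ggml allocates nothing itself (`no_alloc: true`); tensor data lives in the
        // mapping, which the returned `Context` keeps alive via its `mmap` field.
        let file = File::open(path)?;
        let mmap = unsafe { Mmap::map(&file)? };
        Ok(Context::init_mmap(mmap))
    } else {
        // ggml works inside an aligned buffer we own; the `Context` stores it in `buffer`.
        Ok(Context::init_buffer(Buffer::new(ctx_size)))
    }
}
```
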
pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) { let (size, data) = if let Some(buffer) = scratch_buffer { - (buffer.data.len(), buffer.data.as_ptr() as *mut c_void) + (buffer.size(), buffer.data) } else { (0, std::ptr::null_mut()) }; @@ -432,8 +472,7 @@ impl Context { impl Drop for Context { fn drop(&mut self) { - // SAFETY: The only non-weak copy of ptr is no longer accessible after - // this drop call. + // SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call. unsafe { sys::ggml_free(self.ptr.as_ptr()); } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 90ebf42d..79623a69 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -7,7 +7,10 @@ //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. #![deny(missing_docs)] -use std::os::raw::{c_int, c_void}; +use std::{ + alloc::Layout, + os::raw::{c_int, c_void}, +}; mod context; mod tensor; @@ -23,6 +26,9 @@ pub(crate) use ggml_sys as sys; #[cfg(test)] mod tests; +#[cfg(feature = "metal")] +pub mod metal; + /// The type of a tensor element. pub type ElementType = Type; @@ -218,23 +224,35 @@ impl Type { /// /// See [Context::use_scratch]. pub struct Buffer { - data: Box<[u8]>, + data: *mut c_void, + layout: Layout, } +const BUFFER_ALIGN: usize = 16384; + impl Buffer { /// Creates a new buffer of the specified size. pub fn new(size: usize) -> Self { - let mut data: Vec = Vec::with_capacity(size); + let layout = Layout::from_size_align(size, BUFFER_ALIGN).unwrap(); - // SAFETY: The contents are intentionally uninitialized, as they will be passed to - // the ggml C API which will fill them with data. - #[allow(clippy::uninit_vec)] unsafe { - data.set_len(size); + Buffer { + data: std::alloc::alloc(layout).cast(), + layout, + } } + } - Buffer { - data: data.into_boxed_slice(), + /// Returns the size of the buffer in bytes + pub fn size(&self) -> usize { + self.layout.size() + } +} + +impl Drop for Buffer { + fn drop(&mut self) { + unsafe { + std::alloc::dealloc(self.data.cast(), self.layout); } } } diff --git a/crates/ggml/src/metal.rs b/crates/ggml/src/metal.rs new file mode 100644 index 00000000..a86e9273 --- /dev/null +++ b/crates/ggml/src/metal.rs @@ -0,0 +1,123 @@ +//! Metal support. +use crate::{sys::metal, Buffer, ComputationGraph, Context, Tensor}; +use std::{ffi::c_void, ptr::NonNull, sync::Arc}; + +/// Acts as a RAII-guard over a `sys::metal::ggml_metal_context`, allocating via +/// `ggml_metal_init` and dropping via `ggml_metal_free`. +pub struct MetalContext { + ptr: Arc>, + + /// References to the context that hold buffers that are used in this Metal context. As Metal does not need to copy + /// buffers to VRAM, we do need to keep the original buffers alive through this reference. 
+ contexts: Vec>, +} + +impl MetalContext { + /// Create a new Metal context + pub fn new() -> Self { + let raw = unsafe { metal::ggml_metal_init() }; + + MetalContext { + contexts: vec![], + ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + } + } + + /// Register a buffer mapping + pub fn add_scratch_buffer(&mut self, buf: &Buffer) { + unsafe { + let raw_metal_context = self.ptr.as_ptr(); + + //Last we need to add the scratch buffers to the buffers + assert!( + metal::ggml_metal_add_buffer( + raw_metal_context, + "scratch\0".as_ptr().cast(), // FIXME: allocate string and insert number in name + buf.data, + buf.size(), + buf.size() + ), + "{}", + format!("Could not add scratch buffer to metal context") + ); + } + } + + /// Add a context's memory as buffer to this Metal context + pub fn add_context(&mut self, from_context: Arc) { + if self.ref_context(from_context.clone()) { + unsafe { + let raw_context = from_context.ptr.as_ptr(); + + let (data_ptr, data_size): (*mut c_void, usize) = + if let Some(ref mmap) = from_context.mmap { + // This is a bit naughty... + (mmap.as_ptr().cast_mut().cast(), mmap.len()) + } else { + ( + ggml_sys::ggml_get_mem_buffer(raw_context), + ggml_sys::ggml_get_mem_size(raw_context), + ) + }; + + let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context); + assert!( + metal::ggml_metal_add_buffer( + self.ptr.as_ptr(), + "wt\0".as_ptr().cast(), // FIXME provide an actual name + data_ptr, + data_size, + max_size + ), + "Could not add weight buffer to metal context" + ); + } + } + } +} + +impl Default for MetalContext { + fn default() -> Self { + Self::new() + } +} + +impl MetalContext { + /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before. + fn ref_context(&mut self, context: Arc) -> bool { + if self.contexts.iter().any(|c| c.ptr == context.ptr) { + false + } else { + self.contexts.push(context); + true + } + } + + /// Computes the specified graph using Metal. + pub fn graph_compute(&self, graph: &mut ComputationGraph) { + unsafe { + metal::ggml_metal_graph_compute( + self.ptr.as_ptr(), + &mut graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph, + ); + } + } + + /// Reads a tensor from Metal + pub fn get_tensor(&self, tensor: &Tensor) { + unsafe { + metal::ggml_metal_get_tensor( + self.ptr.as_ptr(), + tensor.ptr.as_ptr() as *mut metal::ggml_tensor, + ) + } + } +} + +impl Drop for MetalContext { + fn drop(&mut self) { + // SAFETY: The only non-weak copy of ptr is no longer accessible after + // this drop call. 
+ unsafe { metal::ggml_metal_free(self.ptr.as_ptr()) } + } +} diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index fea9b07b..21617711 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -101,11 +101,11 @@ fn main() { } fn cfg_cublas() -> bool { - cfg!(feature = "cublas") + !cfg!(target_os = "macos") && cfg!(feature = "cublas") } fn cfg_clblast() -> bool { - cfg!(feature = "clblast") + !cfg!(target_os = "macos") && cfg!(feature = "clblast") } fn cfg_metal() -> bool { @@ -179,6 +179,8 @@ fn enable_metal(build: &mut cc::Build) { build.file("llama-cpp/ggml-metal.m"); build.flag("-DGGML_USE_METAL"); + + #[cfg(not(debug_assertions))] build.flag("-DGGML_METAL_NDEBUG"); } diff --git a/crates/llm-base/Cargo.toml b/crates/llm-base/Cargo.toml index 055ecebc..029b70cf 100644 --- a/crates/llm-base/Cargo.toml +++ b/crates/llm-base/Cargo.toml @@ -20,7 +20,7 @@ thiserror = { workspace = true } partial_sort = "0.2.0" serde_bytes = "0.11" -memmap2 = "0.5.10" +memmap2 = { workspace = true } half = "2.2.1" tokenizers = "0.13.3" regex = "1.8" diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 97a92b1e..57ea0908 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,6 +1,10 @@ -use std::fmt::Display; +use ggml::{Buffer, ComputationGraph, Context, Tensor}; +use std::{fmt::Display, sync::Arc}; use thiserror::Error; +#[cfg(feature = "metal")] +use ggml::metal::MetalContext; + use crate::{ mulf, util, InferenceParameters, Model, OutputRequest, Prompt, TokenId, TokenUtf8Buffer, TokenizationError, @@ -12,6 +16,24 @@ use crate::{ // The specific value was copied from `llama.cpp`. const SCRATCH_SIZE: usize = 512 * 1024 * 1024; +type ScratchBuffers = [ggml::Buffer; 2]; + +fn scratch_buffers() -> ScratchBuffers { + [ + ggml::Buffer::new(SCRATCH_SIZE), + ggml::Buffer::new(SCRATCH_SIZE), + ] +} + +/// Result of graph building +pub struct GraphOutputs { + /// The output containing the model's result + pub result: Tensor, + + /// The output containing embeddings + pub embedding_result: Tensor, +} + /// An inference session represents the state of the text generation. This holds /// the full context window, as well as several additional parameters used /// during sampling. @@ -24,10 +46,10 @@ const SCRATCH_SIZE: usize = 512 * 1024 * 1024; /// to use it from multiple threads. pub struct InferenceSession { // Must be kept alive for the model - pub(crate) _session_ctx: ggml::Context, + _session_ctx: Arc, // Original size of the memory used to create this context. - pub(crate) memory_size: usize, + _memory_size: usize, // Configuration for the session. pub(crate) config: InferenceSessionConfig, @@ -59,15 +81,203 @@ pub struct InferenceSession { #[doc(hidden)] pub last_logits: Vec, - /// Scratch buffers used during inference. - /// - /// The number of scratch buffers was copied from `llama.cpp`. - /// There is no specific reason for this number, but one is insufficient. 
- #[doc(hidden)] - pub scratch: [ggml::Buffer; 2], + #[cfg(feature = "metal")] + metal_context: Option, + + ctx0: Context, + + n_embd: usize, + + scratch: ScratchBuffers, +} + +pub struct BuildContext<'session> { + pub ctx0: &'session Context, + pub embd: &'session Tensor, + pub memory_k: &'session Tensor, + pub memory_v: &'session Tensor, + pub scratch: &'session mut ScratchBuffers, +} + +impl<'session> BuildContext<'session> { + pub fn use_scratch(&mut self, idx: Option) { + self.ctx0.use_scratch(match idx { + None => None, + Some(idx) => Some(&mut self.scratch[idx]), + }) + } } + unsafe impl Send for InferenceSession {} impl InferenceSession { + /// Create a new InferenceSession + pub fn new( + config: InferenceSessionConfig, + n_ctx: usize, + n_layer: usize, + n_embd: usize, + n_vocab: usize, + ) -> InferenceSession { + let ctx_size = { + let mut ctx_size = 0; + ctx_size += mulf!( + n_ctx, + n_layer, + n_embd, + ggml::type_sizef(config.memory_k_type.into()) + ); // memory_k + ctx_size += mulf!( + n_ctx, + n_layer, + n_embd, + ggml::type_sizef(config.memory_v_type.into()) + ); // memory_v + ctx_size += (5 + 10 * n_layer) * 256; // object overhead + + ctx_size + }; + + let session_ctx = Arc::new(ggml::Context::init(ctx_size, true)); + + // Initialize key + value memory tensors + let n_mem = n_layer * n_ctx; + let n_elements = n_embd * n_mem; + let memory_k = session_ctx.new_tensor_1d(config.memory_k_type.into(), n_elements); + let memory_v = session_ctx.new_tensor_1d(config.memory_v_type.into(), n_elements); + ggml::set_name(&memory_k, "memory_k"); + ggml::set_name(&memory_v, "memory_v"); + + let scratch = scratch_buffers(); + + // Allocate buffer for storing intermediate values during evaluation (ctx0 backing) + // For the first run, we need to guess a maximum buffer size so we can measure + // the actual memory consumption of the temporary ggml context. + // + // These numbers are from `llama.cpp`, and could potentially be more efficient. 
+ let buf_size = { + let buf_size_mb = if n_layer >= 80 { + 1536 + } else if n_layer >= 60 { + 1280 + } else { + 1024 + }; + buf_size_mb * 1024 * 1024 + }; + + let eval = Buffer::new(buf_size); + let ctx0 = ggml::Context::init_buffer(eval); + + // Set up Metal support + #[cfg(feature = "metal")] + let metal_context = { + if config.use_gpu { + let mut metal_context = MetalContext::new(); + metal_context.add_scratch_buffer(ctx0.buffer.as_ref().unwrap()); + + for buf in scratch.iter() { + metal_context.add_scratch_buffer(buf); + } + metal_context.add_context(session_ctx.clone()); + Some(metal_context) + } else { + None + } + }; + + InferenceSession { + _session_ctx: session_ctx, + _memory_size: ctx_size, + config, + memory_k, + memory_v, + n_past: 0, + mem_per_token: 0, + tokens: vec![], + decoded_tokens: vec![], + last_logits: vec![0.0; n_vocab], + #[cfg(feature = "metal")] + metal_context, + ctx0, + n_embd, + scratch, + } + } + + /// Compute a model (possibly building a graph in the provided closure when called for the first time and/or when parameters have) + pub fn compute( + &mut self, + #[allow(unused_variables)] model_context: Arc, + input_tokens: &[TokenId], + builder: F, + ) -> GraphOutputs + where + F: FnOnce(BuildContext) -> (ComputationGraph, GraphOutputs), + { + // Build a graph + self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap()); + let ctx0 = &self.ctx0; + let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); + ggml::set_name(&embd, "embd"); + + let bc = BuildContext { + ctx0, + embd: &embd, + memory_k: &self.memory_k, + memory_v: &self.memory_v, + scratch: &mut self.scratch, + }; + let (mut built_gf, built_result) = builder(bc); + + // Do Metal'y stuff + #[cfg(feature = "metal")] + { + if let Some(ref mut metal_context) = self.metal_context { + metal_context.add_context(model_context); + } + } + + // Write input tokens + unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; + + // Compute the graph + built_gf.build_forward_expand(&built_result.result); + + #[cfg(feature = "metal")] + { + // FIXME can only process one token at a time currently + // See https://github.com/ggerganov/llama.cpp/blob/e1886cf4fe0d0f31661dda52a4a9f34bd9b9009a/llama.cpp#L1692 + if input_tokens.len() == 1 { + if let Some(ref metal_context) = self.metal_context { + metal_context.graph_compute(&mut built_gf); + metal_context.get_tensor(&built_result.result); + } else { + ctx0.graph_compute(&mut built_gf); + } + } else { + ctx0.graph_compute(&mut built_gf); + } + } + #[cfg(not(feature = "metal"))] + { + ctx0.graph_compute(&mut built_gf); + } + + // Adjust the required memory per token if we didn't know that already + if self.mem_per_token == 0 { + self.mem_per_token = ctx0.used_mem() / self.n_embd; + } + + // Adjust n_past to new length. + self.n_past += input_tokens.len(); + + // Safety: ctx0 will linger around + GraphOutputs { + result: built_result.result.share(), + embedding_result: built_result.embedding_result.share(), + } + } + /// Feed a prompt to the model for this session. 
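
> Reviewer note: `InferenceSession::compute` above replaces the old `prepare_for_evaluate` / `update_session` pair with a single closure-based call. A schematic sketch under assumptions: the free function `forward`, the `wte` embedding tensor, and `n_threads` are illustrative stand-ins; real models build their full layer stack inside the closure, as the BLOOM/GPT-2/GPT-J hunks further down do.

```rust
use std::sync::Arc;

use ggml::{ComputationGraph, Context, Tensor};
use llm_base::{GraphOutputs, InferenceSession, TokenId};

/// Hypothetical single forward pass through the new closure-based API.
fn forward(
    session: &mut InferenceSession,
    model_ctx: Arc<Context>, // the context returned by `TensorLoader::finish`
    wte: &Tensor,            // stand-in for a model's token-embedding tensor
    tokens: &[TokenId],
    n_threads: usize,
) -> GraphOutputs {
    session.compute(model_ctx, tokens, |builder| {
        let ctx0 = builder.ctx0;
        // `compute` writes the token ids into `builder.embd` after the closure returns.
        let hidden = ctx0.op_get_rows(wte, builder.embd);
        // A real model builds its transformer layers here, reading/writing
        // `builder.memory_k` / `builder.memory_v` and switching scratch buffers
        // with `builder.use_scratch(Some(0))` / `Some(1)`.
        let gf = ComputationGraph::new(n_threads);
        (
            gf,
            GraphOutputs {
                result: hidden.share(),
                embedding_result: hidden,
            },
        )
    })
}
```
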
pub fn feed_prompt<'a, E: std::error::Error + 'static, P: Into>>( &mut self, @@ -384,77 +594,6 @@ impl InferenceSession { Ok(session) } } -impl InferenceSession { - /// Create a new InferenceSession - pub fn new( - config: InferenceSessionConfig, - n_ctx: usize, - n_layer: usize, - n_embd: usize, - n_vocab: usize, - ) -> InferenceSession { - let ctx_size = { - let mut ctx_size = 0; - ctx_size += mulf!( - n_ctx, - n_layer, - n_embd, - ggml::type_sizef(config.memory_k_type.into()) - ); // memory_k - ctx_size += mulf!( - n_ctx, - n_layer, - n_embd, - ggml::type_sizef(config.memory_v_type.into()) - ); // memory_v - ctx_size += (5 + 10 * n_layer) * 256; // object overhead - ctx_size - }; - - let session_ctx = ggml::Context::init(ctx_size, true); - - // Initialize key + value memory tensors - let n_mem = n_layer * n_ctx; - let n_elements = n_embd * n_mem; - let memory_k = session_ctx.new_tensor_1d(config.memory_k_type.into(), n_elements); - let memory_v = session_ctx.new_tensor_1d(config.memory_v_type.into(), n_elements); - - InferenceSession { - _session_ctx: session_ctx, - memory_size: ctx_size, - config, - memory_k, - memory_v, - n_past: 0, - mem_per_token: 0, - tokens: vec![], - decoded_tokens: vec![], - last_logits: vec![0.0; n_vocab], - scratch: scratch_buffers(), - } - } -} -impl Clone for InferenceSession { - fn clone(&self) -> Self { - let context = ggml::Context::init(self.memory_size, true); - let memory_k = context.new_tensor_1d(self.memory_k.get_type(), self.memory_k.nelements()); - let memory_v = context.new_tensor_1d(self.memory_v.get_type(), self.memory_v.nelements()); - - Self { - _session_ctx: context, - memory_size: self.memory_size, - config: self.config, - memory_k, - memory_v, - n_past: self.n_past, - mem_per_token: self.mem_per_token, - tokens: self.tokens.clone(), - decoded_tokens: self.decoded_tokens.clone(), - last_logits: self.last_logits.clone(), - scratch: scratch_buffers(), - } - } -} #[derive(Error, Debug)] /// Errors encountered during the inference process. @@ -560,14 +699,19 @@ pub struct InferenceSnapshot { pub struct InferenceSessionConfig { /// The type of the memory K tensor. pub memory_k_type: ModelKVMemoryType, + /// The type of the memory V tensor. 
pub memory_v_type: ModelKVMemoryType, + + /// Whether to use GPU acceleration + pub use_gpu: bool, } impl Default for InferenceSessionConfig { fn default() -> Self { Self { memory_k_type: ModelKVMemoryType::Float16, memory_v_type: ModelKVMemoryType::Float16, + use_gpu: false, } } } @@ -686,10 +830,3 @@ pub fn feed_prompt_callback<'a, E: std::error::Error + 'static>( None => Ok(InferenceFeedback::Continue), } } - -fn scratch_buffers() -> [ggml::Buffer; 2] { - [ - ggml::Buffer::new(SCRATCH_SIZE), - ggml::Buffer::new(SCRATCH_SIZE), - ] -} diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 4889bfaf..127fedf6 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -23,9 +23,9 @@ pub use ggml; pub use ggml::Type as ElementType; pub use inference_session::{ - feed_prompt_callback, InferenceError, InferenceFeedback, InferenceRequest, InferenceResponse, - InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats, ModelKVMemoryType, - SnapshotError, + feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats, + ModelKVMemoryType, SnapshotError, }; pub use loader::{ load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, LoadError, diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 28140882..6524a198 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -342,7 +342,7 @@ pub trait TensorLoader { /// Gets a tensor from the loader. fn load(&mut self, name: &str) -> Result; /// Finish loading the model, and extract all of the state from the loader. - fn finish(self) -> (Context, HashMap, Option); + fn finish(self) -> (Context, HashMap); } /// Load a GGML model from the `path` and configure it per the `params`. The status @@ -467,16 +467,15 @@ pub fn load( } (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); - let context = Context::init(ctx_size, !use_mmap); - - let (mmap, file_size) = { + let (context, file_size) = if use_mmap { let file = File::open(path)?; - let mmap = if use_mmap { - Some(unsafe { Mmap::map(&file)? 
}) - } else { - None - }; - (mmap, file.metadata()?.len()) + unsafe { + let mmap = Mmap::map(&file)?; + let file_size = mmap.len() as u64; + (Context::init_mmap(mmap), file_size) + } + } else { + (Context::init(ctx_size, true), file.metadata()?.len()) }; let tensors_len = tensors.len(); @@ -485,7 +484,6 @@ pub fn load( file, tensors, context, - mmap, lora_adapters, load_progress_callback: &mut load_progress_callback, loaded_tensors: Default::default(), @@ -578,7 +576,6 @@ struct MmapCompatibleLoader<'a> { file: File, tensors: HashMap, context: Context, - mmap: Option, lora_adapters: Option>, load_progress_callback: &'a mut dyn FnMut(LoadProgress), loaded_tensors: HashMap, @@ -594,7 +591,7 @@ impl TensorLoader for MmapCompatibleLoader<'_> { &self.context, &mut self.file, &self.path, - self.mmap.as_ref(), + self.context.mmap.as_ref(), ); let mut tensor = main_context.get_tensor(info)?; @@ -618,8 +615,8 @@ impl TensorLoader for MmapCompatibleLoader<'_> { Ok(tensor) } - fn finish(self) -> (Context, HashMap, Option) { - (self.context, self.loaded_tensors, self.mmap) + fn finish(self) -> (Context, HashMap) { + (self.context, self.loaded_tensors) } } diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index 827f8368..6c378b25 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -104,6 +104,7 @@ impl LoraAdapter { patch_context_size = patch_context_size + (patch_context_size / 20); // Create a temporary context for the patching operations + // TODO: test if GPU can be enabled (make it configurable) let patch_context = ggml::Context::init(patch_context_size, true); let mut patch_file = FileContext::new(&patch_context, &mut self.file, &self.path, None); diff --git a/crates/llm-base/src/model/common.rs b/crates/llm-base/src/model/common.rs index 989e0ea0..e1a6dcce 100644 --- a/crates/llm-base/src/model/common.rs +++ b/crates/llm-base/src/model/common.rs @@ -1,41 +1,6 @@ -use ggml::{Context, Tensor}; +use ggml::Tensor; -use crate::{InferenceSession, OutputRequest, TokenId}; - -/// Common code to prepare a model to evaluate input -pub fn prepare_for_evaluate( - n_layer: usize, - session: &InferenceSession, - input_tokens: &[TokenId], -) -> (Context, Tensor) { - // For the first run, we need to guess a maximum buffer size so we can measure - // the actual memory consumption of the temporary ggml context. - // - // These numbers are from `llama.cpp`, and could potentially be more efficient. 
- let mut buf_size = { - let buf_size_mb = if n_layer >= 80 { - 1536 - } else if n_layer >= 60 { - 1280 - } else { - 1024 - }; - buf_size_mb * 1024 * 1024 - }; - - let n = input_tokens.len(); - if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { - // add 10% to account for ggml object overhead - buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; - }; - let ctx0 = ggml::Context::init(buf_size, true); - - let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); - unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; - ggml::set_name(&embd, "embd"); - - (ctx0, embd) -} +use crate::{InferenceSession, OutputRequest}; /// Return result for just the last token pub fn read_last_token( @@ -92,14 +57,3 @@ pub fn extract_embeddings( embeddings.copy_from_slice(&all_embeddings[n_embd * (n - 1)..]); } } - -/// Update an [InferenceSession] after evaluation -pub fn update_session(session: &mut InferenceSession, ctx0: &Context, n_input: usize, n: usize) { - // Adjust the required memory per token if we didn't know that already - if session.mem_per_token == 0 { - session.mem_per_token = ctx0.used_mem() / n; - } - - // Adjust n_past to new length. - session.n_past += n_input; -} diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 6c5b13e9..15bd8ee5 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -188,6 +188,8 @@ pub struct ModelParameters { pub context_size: usize, /// The [LoRA](https://arxiv.org/abs/2106.09685) adapters to use when loading the model. If `None`, no adapters will be used. pub lora_adapters: Option>, + /// Whether to use GPU acceleration when available + pub use_gpu: bool, } impl Default for ModelParameters { @@ -196,6 +198,7 @@ impl Default for ModelParameters { prefer_mmap: true, context_size: 2048, lora_adapters: None, + use_gpu: false, } } } diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 0bbd873b..a4a7fdeb 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -120,7 +120,7 @@ fn get_embeddings( let vocab = model.vocabulary(); let beginning_of_sentence = true; let query_token_ids = vocab - .tokenize(&format!("{}", query), beginning_of_sentence) + .tokenize(query, beginning_of_sentence) .unwrap() .iter() .map(|(_, tok)| *tok) diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index abe1489c..d44f143e 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -2,11 +2,13 @@ //! for the `llm` ecosystem. #![deny(missing_docs)] +use std::sync::Arc; + use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, }; /// The BLOOM model. Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) @@ -36,8 +38,7 @@ pub struct Bloom { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for Bloom {} @@ -88,7 +89,7 @@ impl KnownModel for Bloom { layers.push(layer); } - let (_context, _, _mmap) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. 
} = params; @@ -103,8 +104,7 @@ impl KnownModel for Bloom { output_norm_bias, output, layers, - _context, - _mmap, + context: Arc::new(context), }) } @@ -139,225 +139,234 @@ impl KnownModel for Bloom { file_type: _, } = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let mut input_layer = ctx0.op_get_rows(&self.wte, &embd); - - // normalize embeddings - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer); - - let mut gf = ggml::ComputationGraph::new(num_threads); - for il in 0..n_layer { - let input_self_attention = input_layer.share(); - let mut current: ggml::Tensor; - - // norm - current = ctx0.op_norm(&input_layer); - - // cur = attention_norm * cur - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), - ¤t, - ); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), - ¤t, - ); - - //attention - current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].query_key_value_b, ¤t), - ¤t, - ); - - // self-attention - let nb = current.get_nb()[1]; - let q_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - //0 * std::mem::size_of::() * n_embd as usize, - 0, - ); - let k_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - std::mem::size_of::() * n_embd, - ); - let v_current = ctx0.op_view_2d( - ¤t, - (n_embd, input_len), - nb, - 2 * std::mem::size_of::() * n_embd, + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0; + let (memory_k_size, memory_v_size) = ( + builder.memory_k.element_size(), + builder.memory_v.element_size(), ); - - // store key and value to memory - if input_len >= 1 { - let k = ctx0.op_view_1d( - &session.memory_k, - input_len * n_embd, - (session.memory_k.element_size() * n_embd) * (il * ctx_size + session_len), + let embd = &builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + + // normalize embeddings + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer); + + let mut gf = ggml::ComputationGraph::new(num_threads); + for il in 0..n_layer { + let input_self_attention = input_layer.share(); + let mut current: ggml::Tensor; + + // norm + current = ctx0.op_norm(&input_layer); + + // cur = attention_norm * cur + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, ); - - let v = ctx0.op_view_1d( - &session.memory_v, - input_len * n_embd, - (session.memory_v.element_size() * n_embd) * (il * ctx_size + session_len), + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), + ¤t, ); - gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - } + //attention + current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].query_key_value_b, ¤t), + ¤t, + ); - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let big_q = ctx0.op_permute( - &ctx0.op_cpy( - &q_current, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - ), - (0, 2, 1, 3), - ); + // 
self-attention + let nb = current.get_nb()[1]; + let q_current = ctx0.op_view_2d( + ¤t, + (n_embd, input_len), + nb, + //0 * std::mem::size_of::() * n_embd as usize, + 0, + ); + let k_current = ctx0.op_view_2d( + ¤t, + (n_embd, input_len), + nb, + std::mem::size_of::() * n_embd, + ); + let v_current = ctx0.op_view_2d( + ¤t, + (n_embd, input_len), + nb, + 2 * std::mem::size_of::() * n_embd, + ); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - let big_k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * session.memory_k.element_size() * n_embd, + // store key and value to memory + if input_len >= 1 { + let k = ctx0.op_view_1d( + builder.memory_k, + input_len * n_embd, + (memory_k_size * n_embd) * (il * ctx_size + session_len), + ); + + let v = ctx0.op_view_1d( + builder.memory_v, + input_len * n_embd, + (memory_v_size * n_embd) * (il * ctx_size + session_len), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let big_q = ctx0.op_permute( + &ctx0.op_cpy( + &q_current, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - // K * Q - let k_q = ctx0.op_mul_mat(&big_k, &big_q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let k_q_scaled = ctx0.op_scale( - &k_q, - &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - //alibi - // KQ_scaled_alibi = KQ_scaled + alibi_bias - let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, session_len, n_head, 8f32); - - // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, session_len); - - // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - - let memv_elsize = session.memory_v.element_size(); + (0, 2, 1, 3), + ); - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + let big_k = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &session.memory_v, + builder.memory_k, (session_len + input_len) * n_embd, - il * ctx_size * memv_elsize * n_embd, + il * ctx_size * memory_k_size * n_embd, ), n_embd / n_head, n_head, session_len + input_len, ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - session.memory_v.get_type(), - session_len + input_len, - n_embd / n_head, - n_head, - ), - ); + (0, 2, 1, 3), + ); - let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); + // K * Q + let k_q = ctx0.op_mul_mat(&big_k, &big_q); - // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let k_q_scaled = ctx0.op_scale( + &k_q, + &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); + //alibi + // KQ_scaled_alibi = KQ_scaled + alibi_bias + let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, session_len, n_head, 8f32); + + // KQ_masked = mask_past(KQ_scaled) + let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, session_len); + + // KQ = soft_max(KQ_masked) + let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); + + let memv_elsize = memory_v_size; + + let v_trans = ctx0.op_cpy( + 
&ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_v, + (session_len + input_len) * n_embd, + il * ctx_size * memv_elsize * n_embd, + ), + n_embd / n_head, + n_head, + session_len + input_len, + ), + (1, 2, 0, 3), + ), + &ctx0.new_tensor_3d( + builder.memory_v.get_type(), + session_len + input_len, + n_embd / n_head, + n_head, + ), + ); - // projection - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, ¤t), ¤t); + let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + // KQV_merged = KQV.permute(0, 2, 1, 3) + let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - // feed-forward network - // norm - current = ctx0.op_norm(&input_feed_forward); + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + ); - // cur = ffn_norm*cur + ffn_norm_b - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), - ¤t, - ); + // projection + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, ¤t), ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].ffn_norm_b, ¤t), - ¤t, - ); + let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + // feed-forward network + // norm + current = ctx0.op_norm(&input_feed_forward); - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, ¤t), ¤t); + // cur = ffn_norm*cur + ffn_norm_b + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), + ¤t, + ); - // SILU activation + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].ffn_norm_b, ¤t), + ¤t, + ); - current = ctx0.op_gelu(¤t); + current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, ¤t), ¤t); - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, ¤t), ¤t); + // SILU activation - current = ctx0.op_add(¤t, &input_feed_forward); + current = ctx0.op_gelu(¤t); - // input for next layer - input_layer = current; - } + current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); - // norm - input_layer = ctx0.op_norm(&input_layer); + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, ¤t), ¤t); - // inpL = norm*inpL - input_layer = ctx0.op_mul( - &ctx0.op_repeat(&self.output_norm, &input_layer), - &input_layer, - ); + current = ctx0.op_add(¤t, &input_feed_forward); - input_layer = ctx0.op_add( - &ctx0.op_repeat(&self.output_norm_bias, &input_layer), - &input_layer, - ); + // input for next layer + input_layer = current; + } + + // norm + input_layer = ctx0.op_norm(&input_layer); + + // inpL = norm*inpL + input_layer = ctx0.op_mul( + &ctx0.op_repeat(&self.output_norm, &input_layer), + &input_layer, + ); + + input_layer = ctx0.op_add( + &ctx0.op_repeat(&self.output_norm_bias, &input_layer), + &input_layer, + ); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + let embeddings_tensor: ggml::Tensor = input_layer.share(); - // lm_head - input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + // lm_head + input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result: embeddings_tensor, + }, + ) + 
}); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, input_len); - common::extract_logits(output_request, &input_layer, n_vocab, input_len); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - common::update_session(session, &ctx0, input_tokens.len(), input_len); + common::read_last_token(session, &outputs.result, n_vocab, input_len); + common::extract_logits(output_request, &outputs.result, n_vocab, input_len); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } fn vocabulary(&self) -> &Vocabulary { diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index 6977ccd3..abc0726d 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -1,12 +1,14 @@ //! An implementation of [GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2) for the `llm` ecosystem. #![deny(missing_docs)] +use std::sync::Arc; + use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, }; /// The GPT-2 model. Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) @@ -35,8 +37,7 @@ pub struct Gpt2 { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for Gpt2 {} @@ -80,7 +81,7 @@ impl KnownModel for Gpt2 { layers.push(layer); } - let (_context, _, _mmap) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -94,8 +95,7 @@ impl KnownModel for Gpt2 { wte, wpe, lm_head, - _context, - _mmap, + context: Arc::new(context), }) } @@ -129,193 +129,198 @@ impl KnownModel for Gpt2 { .. 
} = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let position_buf: Vec = (0..input_len).map(|i| (session_len + i) as i32).collect(); - - let mut position = ctx0.new_tensor_1d(ggml::Type::I32, input_len); - unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; - - let mut input_layer = ctx0.op_add( - &ctx0.op_get_rows(&self.wte, &embd), - &ctx0.op_get_rows(&self.wpe, &position), - ); - - let memory_k = &session.memory_k; - let memory_k_size = memory_k.element_size(); + let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { + let ctx0 = builder.ctx0; + let (memory_k_size, memory_v_size) = ( + builder.memory_k.element_size(), + builder.memory_v.element_size(), + ); + let embd = &builder.embd; - let memory_v = &session.memory_v; - let memory_v_size = memory_v.element_size(); + let position_buf: Vec = (0..input_len).map(|i| (session_len + i) as i32).collect(); - let mut gf = ggml::ComputationGraph::new(num_threads); - for il in 0..n_layer { - ctx0.use_scratch(Some(&mut session.scratch[0])); + let mut position = ctx0.new_tensor_1d(ggml::Type::I32, input_len); + unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; - // norm - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), - &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), + let mut input_layer = ctx0.op_add( + &ctx0.op_get_rows(&self.wte, embd), + &ctx0.op_get_rows(&self.wpe, &position), ); - // attn - current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_attn_attn_b, ¤t), - ¤t, - ); + let mut gf = ggml::ComputationGraph::new(num_threads); + for il in 0..n_layer { + builder.use_scratch(Some(0)); - // self-attn - let nb = current.get_nb()[1]; - let f32_size = std::mem::size_of::(); - let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); - let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); - let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); - - if input_len >= 1 { - let k = ctx0.op_view_1d( - memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_1d( - memory_v, - input_len * n_embd, - (memory_v_size * n_embd) * (il * ctx_size + session_len), + // norm + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), ); - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - } - - let q = ctx0.op_permute( - &ctx0.op_cpy( - &qcur, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - ), - (0, 2, 1, 3), - ); + // attn + current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_attn_b, ¤t), + ¤t, + ); - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, + // self-attn + let nb = current.get_nb()[1]; + let f32_size = std::mem::size_of::(); + let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); + let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); + let vcur = + ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); + + if input_len 
>= 1 { + let k = ctx0.op_view_1d( + builder.memory_k, + input_len * n_embd, + (memory_k_size * n_embd) * (il * ctx_size + session_len), + ); + let v = ctx0.op_view_1d( + builder.memory_v, + input_len * n_embd, + (memory_v_size * n_embd) * (il * ctx_size + session_len), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + } + + let q = ctx0.op_permute( + &ctx0.op_cpy( + &qcur, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - let kq = ctx0.op_mul_mat(&k, &q); - let kq_scaled = ctx0.op_scale_inplace( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); - let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); + (0, 2, 1, 3), + ); - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( + let k = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - memory_v, + builder.memory_k, (session_len + input_len) * n_embd, - il * ctx_size * memory_v_size * n_embd, + il * ctx_size * memory_k_size * n_embd, ), n_embd / n_head, n_head, session_len + input_len, ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - memory_v.get_type(), - session_len + input_len, - n_embd / n_head, - n_head, - ), - ); + (0, 2, 1, 3), + ); - let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + let kq = ctx0.op_mul_mat(&k, &q); + let kq_scaled = ctx0.op_scale_inplace( + &kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); + let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); + let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); + + let v_trans = ctx0.op_cpy( + &ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_v, + (session_len + input_len) * n_embd, + il * ctx_size * memory_v_size * n_embd, + ), + n_embd / n_head, + n_head, + session_len + input_len, + ), + (1, 2, 0, 3), + ), + &ctx0.new_tensor_3d( + builder.memory_v.get_type(), + session_len + input_len, + n_embd / n_head, + n_head, + ), + ); - // projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_attn_proj_b, ¤t), - ¤t, - ); + let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); + let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - // add input - current = ctx0.op_add(¤t, &input_layer); + current = ctx0.op_cpy( + &kqv_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + ); - // feed-forward - let ff_in = current.share(); + // projection + current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_proj_b, ¤t), + ¤t, + ); - ctx0.use_scratch(Some(&mut session.scratch[1])); + // add input + current = ctx0.op_add(¤t, &input_layer); - // feed-forward normalization - current = ctx0.op_norm(&ff_in); - current = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_2_g, ¤t), ¤t), - &ctx0.op_repeat(&self.layers[il].ln_2_b, ¤t), - ); + // feed-forward + let ff_in = current.share(); - // feed-forward fully connected - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_mlp_fc_b, ¤t), - ¤t, - ); + builder.use_scratch(Some(1)); - // feed-forward 
activation - current = ctx0.op_gelu(¤t); + // feed-forward normalization + current = ctx0.op_norm(&ff_in); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_2_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_2_b, ¤t), + ); - // feed-forward projection - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_mlp_proj_b, ¤t), - ¤t, - ); + // feed-forward fully connected + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_fc_b, ¤t), + ¤t, + ); - // input for next layer - input_layer = ctx0.op_add(¤t, &ff_in); - } + // feed-forward activation + current = ctx0.op_gelu(¤t); - ctx0.use_scratch(Some(&mut session.scratch[0])); + // feed-forward projection + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_proj_b, ¤t), + ¤t, + ); + + // input for next layer + input_layer = ctx0.op_add(¤t, &ff_in); + } - // normalization - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), - &ctx0.op_repeat(&self.ln_f_b, &input_layer), - ); + builder.use_scratch(Some(0)); + + // normalization + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), + &ctx0.op_repeat(&self.ln_f_b, &input_layer), + ); - ctx0.use_scratch(None); + builder.use_scratch(None); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + let embeddings_tensor: ggml::Tensor = input_layer.share(); - input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); + input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result: embeddings_tensor, + }, + ) + }); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, input_len); - common::extract_logits(output_request, &input_layer, n_vocab, input_len); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - common::update_session(session, &ctx0, input_tokens.len(), input_len); + common::read_last_token(session, &outputs.result, n_vocab, input_len); + common::extract_logits(output_request, &outputs.result, n_vocab, input_len); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } fn vocabulary(&self) -> &Vocabulary { diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index d537b884..5ec7d5bc 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -1,14 +1,15 @@ //! An implementation of [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj) for the `llm` ecosystem. #![deny(missing_docs)] -use std::error::Error; +use std::{error::Error, sync::Arc}; use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - LoadError, Mmap, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, + Vocabulary, }; /// The GPT-J model. 
Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) @@ -36,8 +37,7 @@ pub struct GptJ { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for GptJ {} @@ -82,7 +82,7 @@ impl KnownModel for GptJ { layers.push(layer); } - let (_context, _, _mmap) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -96,8 +96,7 @@ impl KnownModel for GptJ { lmh_g, lmh_b, layers, - _mmap, - _context, + context: Arc::new(context), }) } @@ -132,161 +131,165 @@ impl KnownModel for GptJ { .. } = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let mut input_layer = ctx0.op_get_rows(&self.wte, &embd); - - let memory_k = &session.memory_k; - let memory_k_size = memory_k.element_size(); - - let memory_v = &session.memory_v; - let memory_v_size = memory_v.element_size(); - - let mut gf = ggml::ComputationGraph::new(num_threads); - for il in 0..n_layer { - // norm - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), - &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), - ); - - let input_sa = current.share(); - - // self-attention - let qcur = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, ¤t), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, + let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let ctx0 = builder.ctx0; + let (memory_k_size, memory_v_size) = ( + builder.memory_k.element_size(), + builder.memory_v.element_size(), ); - let kcur = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].c_attn_k_proj_w, ¤t), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, - ); - - // self-attention store key and value to memory - let vcur = - ctx0.op_transpose(&ctx0.op_mul_mat(&self.layers[il].c_attn_v_proj_w, ¤t)); - - let k = ctx0.op_view_1d( - memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_2d( - memory_v, - (input_len, n_embd), - ctx_size * memory_v_size, - (il * ctx_size) * memory_v_size * n_embd + session_len * memory_v_size, - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - let q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); - let big_k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, + let embd = builder.embd; + + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + + let mut gf = ggml::ComputationGraph::new(num_threads); + for il in 0..n_layer { + // norm + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), + ); + + let input_sa = current.share(); + + // self-attention + let qcur = ctx0.op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, ¤t), + n_embd / n_head, + n_head, + input_len, ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - - let kq = ctx0.op_mul_mat(&big_k, &q); - let kq_scaled = ctx0.op_scale_inplace( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - let kq_masked = 
ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); - let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); - - let big_v = ctx0.op_view_3d( - memory_v, - (session_len + input_len, n_embd / n_head, n_head), - ( + session_len, + n_rot, + 0, + ); + let kcur = ctx0.op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].c_attn_k_proj_w, ¤t), + n_embd / n_head, + n_head, + input_len, + ), + session_len, + n_rot, + 0, + ); + + // self-attention store key and value to memory + let vcur = + ctx0.op_transpose(&ctx0.op_mul_mat(&self.layers[il].c_attn_v_proj_w, ¤t)); + + let k = ctx0.op_view_1d( + builder.memory_k, + input_len * n_embd, + (memory_k_size * n_embd) * (il * ctx_size + session_len), + ); + let v = ctx0.op_view_2d( + builder.memory_v, + (input_len, n_embd), ctx_size * memory_v_size, - ctx_size * memory_v_size * n_embd / n_head, - ), - il * ctx_size * memory_v_size * n_embd, - ); + (il * ctx_size) * memory_v_size * n_embd + session_len * memory_v_size, + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + let q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); + let big_k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_k, + (session_len + input_len) * n_embd, + il * ctx_size * memory_k_size * n_embd, + ), + n_embd / n_head, + n_head, + session_len + input_len, + ), + (0, 2, 1, 3), + ); + + let kq = ctx0.op_mul_mat(&big_k, &q); + let kq_scaled = ctx0.op_scale_inplace( + &kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + let kq_masked = ctx0.op_diag_mask_inf_inplace(&kq_scaled, session_len); + let kq_softmax = ctx0.op_soft_max_inplace(&kq_masked); + + let big_v = ctx0.op_view_3d( + builder.memory_v, + (session_len + input_len, n_embd / n_head, n_head), + ( + ctx_size * memory_v_size, + ctx_size * memory_v_size * n_embd / n_head, + ), + il * ctx_size * memory_v_size * n_embd, + ); - let kqv = ctx0.op_mul_mat(&big_v, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + let kqv = ctx0.op_mul_mat(&big_v, &kq_softmax); + let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); + current = ctx0.op_cpy( + &kqv_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + ); - // self-attention projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); + // self-attention projection + current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - // feed-forward - let ff_in = current.share(); + // feed-forward + let ff_in = current.share(); - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, &input_sa); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_mlp_fc_b, ¤t), - ¤t, - ); + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, &input_sa); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_fc_b, ¤t), + ¤t, + ); - current = ctx0.op_gelu(¤t); + current = ctx0.op_gelu(¤t); - // feed-forward projection - current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_mlp_proj_b, ¤t), - ¤t, - ); + // feed-forward projection + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_proj_b, ¤t), + ¤t, + ); - current = ctx0.op_add(¤t, &ff_in); + current = ctx0.op_add(¤t, &ff_in); - // input for next layer - input_layer = ctx0.op_add(¤t, &input_layer); - } + // 
input for next layer + input_layer = ctx0.op_add(¤t, &input_layer); + } - // norm - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), - &ctx0.op_repeat(&self.ln_f_b, &input_layer), - ); + // norm + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), + &ctx0.op_repeat(&self.ln_f_b, &input_layer), + ); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + let embeddings_tensor: ggml::Tensor = input_layer.share(); - // lm_head - input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); - input_layer = ctx0.op_add(&ctx0.op_repeat(&self.lmh_b, &input_layer), &input_layer); + // lm_head + input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); + input_layer = ctx0.op_add(&ctx0.op_repeat(&self.lmh_b, &input_layer), &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result: embeddings_tensor, + }, + ) + }); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, input_len); - common::extract_logits(output_request, &input_layer, n_vocab, input_len); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - common::update_session(session, &ctx0, input_tokens.len(), input_len); + common::read_last_token(session, &outputs.result, n_vocab, input_len); + common::extract_logits(output_request, &outputs.result, n_vocab, input_len); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } fn vocabulary(&self) -> &Vocabulary { diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 1aa66de7..5b4ea0c0 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -2,14 +2,15 @@ //! This crate also supports the [RedPajama](https://www.together.xyz/blog/redpajama) GPT-NeoX model. #![deny(missing_docs)] -use std::error::Error; +use std::{error::Error, sync::Arc}; use ggml::Tensor; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - LoadError, Mmap, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, + Vocabulary, }; /// The GPT-NeoX model. Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -36,8 +37,7 @@ pub struct GptNeoX { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for GptNeoX {} @@ -96,7 +96,7 @@ impl KnownModel for GptNeoX { layers.push(layer); } - let (_context, _, _mmap) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -109,8 +109,7 @@ impl KnownModel for GptNeoX { wte, lmh_g, layers, - _context, - _mmap, + context: Arc::new(context), }) } @@ -148,192 +147,195 @@ impl KnownModel for GptNeoX { .. 
} = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let mut input_layer = ctx0.op_get_rows(&self.wte, &embd); - - let memory_k = &session.memory_k; - let memory_k_size = memory_k.element_size(); - - let memory_v = &session.memory_v; - let memory_v_size = memory_v.element_size(); - - let mut gf = ggml::ComputationGraph::new(n_threads); - - for il in 0..n_layer { - // attention uses first scratch buffer - ctx0.use_scratch(Some(&mut session.scratch[0])); - - // self-attention - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), - &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), - ); - - // self-attention compute QKV - current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_attn_attn_b, ¤t), - ¤t, - ); - - let nb = current.get_nb()[1]; - let f32_size = std::mem::size_of::(); - - let mut qcur = ctx0.op_cont(&ctx0.op_view_3d( - ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), - 0, - )); - let mut kcur = ctx0.op_cont(&ctx0.op_view_3d( - ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), - f32_size * n_embd / n_head, - )); - let mut vcur = ctx0.op_cont(&ctx0.op_view_3d( - ¤t, - (n_embd / n_head, n_head, n), - (nb / n_head, nb), - 2 * f32_size * n_embd / n_head, - )); - - // self-attention using mode = 2 for GPT-NeoX mode - qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2); - kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2); - - // store key and value to memory - vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, n_embd, n)); - - let k = ctx0.op_view_1d( - memory_k, - n * n_embd, - (memory_k_size * n_embd) * (il * n_ctx + n_past), - ); - - let v = ctx0.op_view_2d( - memory_v, - (n, n_embd), - n_ctx * memory_v_size, - (il * n_ctx) * memory_v_size * n_embd + n_past * memory_v_size, - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let Q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - let K = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - memory_k, - (n_past + n) * n_embd, - il * n_ctx * memory_k_size * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, - ), - (0, 2, 1, 3), - ); - - // K * Q - let KQ = ctx0.op_mul_mat(&K, &Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let KQ_scaled = ctx0.op_scale_inplace( - &KQ, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { + let ctx0 = builder.ctx0; + let embd = builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + let (memory_k_size, memory_v_size) = ( + builder.memory_k.element_size(), + builder.memory_v.element_size(), ); - // KQ_masked = mask_past(KQ_scaled) - let KQ_masked = ctx0.op_diag_mask_inf_inplace(&KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - let KQ_softmax = ctx0.op_soft_max_inplace(&KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - let V = ctx0.op_view_3d( - memory_v, - (n_past + n, n_embd / n_head, n_head), - ( + let mut gf = ggml::ComputationGraph::new(n_threads); + + for il in 0..n_layer { + // attention uses first scratch buffer + builder.use_scratch(Some(0)); + + // self-attention + let mut 
current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), + ); + + // self-attention compute QKV + current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_attn_b, ¤t), + ¤t, + ); + + let nb = current.get_nb()[1]; + let f32_size = std::mem::size_of::(); + + let mut qcur = ctx0.op_cont(&ctx0.op_view_3d( + ¤t, + (n_embd / n_head, n_head, n), + (nb / n_head, nb), + 0, + )); + let mut kcur = ctx0.op_cont(&ctx0.op_view_3d( + ¤t, + (n_embd / n_head, n_head, n), + (nb / n_head, nb), + f32_size * n_embd / n_head, + )); + let mut vcur = ctx0.op_cont(&ctx0.op_view_3d( + ¤t, + (n_embd / n_head, n_head, n), + (nb / n_head, nb), + 2 * f32_size * n_embd / n_head, + )); + + // self-attention using mode = 2 for GPT-NeoX mode + qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2); + kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2); + + // store key and value to memory + vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, n_embd, n)); + + let k = ctx0.op_view_1d( + builder.memory_k, + n * n_embd, + (memory_k_size * n_embd) * (il * n_ctx + n_past), + ); + + let v = ctx0.op_view_2d( + builder.memory_v, + (n, n_embd), n_ctx * memory_v_size, - n_ctx * memory_v_size * n_embd / n_head, - ), - il * n_ctx * memory_v_size * n_embd, - ); - - // KQV = transpose(V) * KQ_soft_max - let KQV = ctx0.op_mul_mat(&V, &KQ_softmax); - // KQV_merged = KQV.permute(0, 2, 1, 3) - let KQV_merged = ctx0.op_permute(&KQV, (0, 2, 1, 3)); - - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy(&KQV_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); - - // self-attention projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].c_attn_proj_b, ¤t), - ¤t, - ); - - // use the second scratch for the feed forward - ctx0.use_scratch(Some(&mut session.scratch[1])); - - let feedforward_input: Tensor; - if !use_parallel_residual { - feedforward_input = ctx0.op_add(¤t, &input_layer); - current = feed_forward_network(&ctx0, &self.layers[il], &feedforward_input); - // input for next layer - input_layer = ctx0.op_add(¤t, &feedforward_input); - } else { - // calculate with parallel residual - feedforward_input = current.share(); - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - current = feed_forward_network(&ctx0, &self.layers[il], &input_layer); - - // layer input + FF - current = ctx0.op_add(¤t, &feedforward_input); - - // input for next layer - input_layer = ctx0.op_add(¤t, &input_layer); + (il * n_ctx) * memory_v_size * n_embd + n_past * memory_v_size, + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let Q = ctx0.op_permute(&qcur, (0, 2, 1, 3)); + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + let K = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_k, + (n_past + n) * n_embd, + il * n_ctx * memory_k_size * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + (0, 2, 1, 3), + ); + + // K * Q + let KQ = ctx0.op_mul_mat(&K, &Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let KQ_scaled = ctx0.op_scale_inplace( + &KQ, + &ctx0.new_f32(1f32 / 
f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + // KQ_masked = mask_past(KQ_scaled) + let KQ_masked = ctx0.op_diag_mask_inf_inplace(&KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + let KQ_softmax = ctx0.op_soft_max_inplace(&KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + let V = ctx0.op_view_3d( + builder.memory_v, + (n_past + n, n_embd / n_head, n_head), + ( + n_ctx * memory_v_size, + n_ctx * memory_v_size * n_embd / n_head, + ), + il * n_ctx * memory_v_size * n_embd, + ); + + // KQV = transpose(V) * KQ_soft_max + let KQV = ctx0.op_mul_mat(&V, &KQ_softmax); + // KQV_merged = KQV.permute(0, 2, 1, 3) + let KQV_merged = ctx0.op_permute(&KQV, (0, 2, 1, 3)); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy(&KQV_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + + // self-attention projection + current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_proj_b, ¤t), + ¤t, + ); + + // use the second scratch for the feed forward + builder.use_scratch(Some(1)); + + let feedforward_input: Tensor; + if !use_parallel_residual { + feedforward_input = ctx0.op_add(¤t, &input_layer); + current = feed_forward_network(ctx0, &self.layers[il], &feedforward_input); + // input for next layer + input_layer = ctx0.op_add(¤t, &feedforward_input); + } else { + // calculate with parallel residual + feedforward_input = current.share(); + + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + // note here we pass inpL instead of cur + current = feed_forward_network(ctx0, &self.layers[il], &input_layer); + + // layer input + FF + current = ctx0.op_add(¤t, &feedforward_input); + + // input for next layer + input_layer = ctx0.op_add(¤t, &input_layer); + } } - } - // use the first scratch for the norm - ctx0.use_scratch(Some(&mut session.scratch[1])); + // use the first scratch for the norm + builder.use_scratch(Some(1)); - // normalize the output - input_layer = ctx0.op_norm(&input_layer); - // inpL = ln_f_g*inpL + ln_f_b - input_layer = ctx0.op_add( - &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), - &ctx0.op_repeat(&self.ln_f_b, &input_layer), - ); + // normalize the output + input_layer = ctx0.op_norm(&input_layer); + // inpL = ln_f_g*inpL + ln_f_b + input_layer = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), + &ctx0.op_repeat(&self.ln_f_b, &input_layer), + ); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + let embeddings_tensor: ggml::Tensor = input_layer.share(); - // Disable the scratchbuffer - ctx0.use_scratch(None); + // Disable the scratchbuffer + ctx0.use_scratch(None); - // apply language model head - input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); + // apply language model head + input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result: embeddings_tensor, + }, + ) + }); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, n); - common::extract_logits(output_request, &input_layer, n_vocab, n); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, n); - common::update_session(session, &ctx0, input_tokens.len(), n); + common::read_last_token(session, &outputs.result, n_vocab, n); + 
common::extract_logits(output_request, &outputs.result, n_vocab, n); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } fn vocabulary(&self) -> &Vocabulary { diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 21d78fa6..d4abb2e1 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -1,13 +1,14 @@ //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem. #![deny(missing_docs)] -use std::error::Error; +use std::{error::Error, sync::Arc}; use llm_base::{ ggml, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - LoadError, Mmap, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, + Vocabulary, }; /// The LLaMA model. Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -33,8 +34,7 @@ pub struct Llama { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for Llama {} @@ -73,7 +73,7 @@ impl KnownModel for Llama { layers.push(layer); } - let (_context, _tensors, _mmap) = tl.finish(); + let (context, _tensors) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -85,8 +85,7 @@ impl KnownModel for Llama { norm, output, layers, - _context, - _mmap, + context: Arc::new(context), }) } @@ -123,203 +122,204 @@ impl KnownModel for Llama { file_type: _, } = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let mut input_layer = ctx0.op_get_rows(&self.wte, &embd); - - let memory_k_size = session.memory_k.element_size(); - let memory_v_size = session.memory_v.element_size(); - - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - let mut gf = ggml::ComputationGraph::new( - if input_len >= 32 && ggml::cpu_has_blas() && !ggml::cpu_has_gpublas() { - 1 - } else { - num_threads - }, - ); - for il in 0..n_layer { - let input_self_attention = input_layer.share(); - let mut current: ggml::Tensor; - - ctx0.use_scratch(Some(&mut session.scratch[0])); - - // norm - current = ctx0.op_rms_norm(&input_layer); - - // cur = attention_norm * cur - current = ctx0.op_mul(¤t, &self.layers[il].attention_norm); - - // self-attention - // compute Q and K and RoPE them - let q_current = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, + let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { + let ctx0 = builder.ctx0; + let embd = builder.embd; + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + let mut gf = ggml::ComputationGraph::new( + if input_len >= 32 && ggml::cpu_has_blas() && !ggml::cpu_has_gpublas() { + 1 + } else { + num_threads + }, ); - ggml::set_name(&q_current, "Qcur"); - let k_current = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - 
&ctx0.op_mul_mat(&self.layers[il].wk, ¤t), - n_embd / n_head, - n_head, + for il in 0..n_layer { + let input_self_attention = input_layer.share(); + let mut current: ggml::Tensor; + + builder.use_scratch(Some(0)); + + // norm + current = ctx0.op_rms_norm(&input_layer); + + // cur = attention_norm * cur + current = ctx0.op_mul(¤t, &self.layers[il].attention_norm); + + // self-attention + // compute Q and K and RoPE them + let q_current = ctx0.op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), + n_embd / n_head, + n_head, + input_len, + ), + session_len, + n_rot, + 0, + ); + ggml::set_name(&q_current, "Qcur"); + let k_current = ctx0.op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), + n_embd / n_head, + n_head, + input_len, + ), + session_len, + n_rot, + 0, + ); + ggml::set_name(&k_current, "Kcur"); + + // store key and value to memory + // compute the transposed [N, n_embd] V matrix + let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( + &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), + n_embd, input_len, - ), - session_len, - n_rot, - 0, - ); - ggml::set_name(&k_current, "Kcur"); - - // store key and value to memory - // compute the transposed [N, n_embd] V matrix - let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( - &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), - n_embd, - input_len, - )); - - let k = ctx0.op_view_1d( - &session.memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - - let v = ctx0.op_view_2d( - &session.memory_v, - (input_len, n_embd), - ctx_size * memory_v_size, - (il * ctx_size) * memory_v_size * n_embd + session_len * memory_v_size, - ); - - // important: storing RoPE-ed version of K in the KV cache! - gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - - let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); - ggml::set_name(&q, "Q"); - - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, + )); + + let k = ctx0.op_view_1d( + builder.memory_k, + input_len * n_embd, + (builder.memory_k.element_size() * n_embd) * (il * ctx_size + session_len), + ); + + let v = ctx0.op_view_2d( + builder.memory_v, + (input_len, n_embd), + ctx_size * builder.memory_v.element_size(), + (il * ctx_size) * builder.memory_v.element_size() * n_embd + + session_len * builder.memory_v.element_size(), + ); + + // important: storing RoPE-ed version of K in the KV cache! 
+ gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); + + let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); + ggml::set_name(&q, "Q"); + + let k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_k, + (session_len + input_len) * n_embd, + il * ctx_size * builder.memory_k.element_size() * n_embd, + ), + n_embd / n_head, + n_head, + session_len + input_len, ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - ggml::set_name(&k, "K"); - - // K * Q - let k_q = ctx0.op_mul_mat(&k, &q); - ggml::set_name(&k_q, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())); - ggml::set_name(&kq_scale, "1/sqrt(n_embd/n_head)"); - let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale); - ggml::set_name(&k_q_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len); - ggml::set_name(&k_q_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked); - ggml::set_name(&k_q_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - let v = ctx0.op_view_3d( - &session.memory_v, - (session_len + input_len, n_embd / n_head, n_head), - ( - ctx_size * memory_v_size, - ctx_size * memory_v_size * n_embd / n_head, - ), - il * ctx_size * memory_v_size * n_embd, - ); - ggml::set_name(&v, "V"); - - let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - ggml::set_name(&k_q_v, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - ggml::set_name(&k_q_v_merged, "KQV_merged"); + (0, 2, 1, 3), + ); + ggml::set_name(&k, "K"); + + // K * Q + let k_q = ctx0.op_mul_mat(&k, &q); + ggml::set_name(&k_q, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())); + ggml::set_name(&kq_scale, "1/sqrt(n_embd/n_head)"); + let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale); + ggml::set_name(&k_q_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len); + ggml::set_name(&k_q_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked); + ggml::set_name(&k_q_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + let v = ctx0.op_view_3d( + builder.memory_v, + (session_len + input_len, n_embd / n_head, n_head), + ( + ctx_size * builder.memory_v.element_size(), + ctx_size * builder.memory_v.element_size() * n_embd / n_head, + ), + il * ctx_size * builder.memory_v.element_size() * n_embd, + ); + ggml::set_name(&v, "V"); - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - ggml::set_name(¤t, "KQV_merged_contiguous"); + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); + ggml::set_name(&k_q_v, "KQV"); - // projection (no bias) - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + // KQV_merged = KQV.permute(0, 2, 1, 3) + let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); + ggml::set_name(&k_q_v_merged, "KQV_merged"); - ctx0.use_scratch(Some(&mut session.scratch[1])); + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + ); + 
ggml::set_name(¤t, "KQV_merged_contiguous"); - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + // projection (no bias) + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - // feed-forward network - // norm - current = ctx0.op_rms_norm(&input_feed_forward); + builder.use_scratch(Some(1)); - // cur = cur*ffn_norm(broadcasted) - current = ctx0.op_mul(¤t, &self.layers[il].ffn_norm); + let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); - let tmp = ctx0.op_mul_mat(&self.layers[il].w3, ¤t); + // feed-forward network + // norm + current = ctx0.op_rms_norm(&input_feed_forward); - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + // cur = cur*ffn_norm(broadcasted) + current = ctx0.op_mul(¤t, &self.layers[il].ffn_norm); - // SILU activation - current = ctx0.op_silu(¤t); + let tmp = ctx0.op_mul_mat(&self.layers[il].w3, ¤t); - current = ctx0.op_mul(¤t, &tmp); + current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + // SILU activation + current = ctx0.op_silu(¤t); - current = ctx0.op_add(¤t, &input_feed_forward); + current = ctx0.op_mul(¤t, &tmp); - // input for next layer - input_layer = current; - } + current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); - ctx0.use_scratch(Some(&mut session.scratch[0])); + current = ctx0.op_add(¤t, &input_feed_forward); - // norm - input_layer = ctx0.op_rms_norm(&input_layer); + // input for next layer + input_layer = current; + } + builder.use_scratch(Some(0)); - // inpL = inpL*norm(broadcasted) - input_layer = ctx0.op_mul(&input_layer, &self.norm); + // norm + input_layer = ctx0.op_rms_norm(&input_layer); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + // inpL = inpL*norm(broadcasted) + input_layer = ctx0.op_mul(&input_layer, &self.norm); - // lm_head - input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + let embedding_result: ggml::Tensor = input_layer.share(); - ctx0.use_scratch(None); + // lm_head + input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ctx0.use_scratch(None); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result, + }, + ) + }); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, input_len); - common::extract_logits(output_request, &input_layer, n_vocab, input_len); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - common::update_session(session, &ctx0, input_tokens.len(), input_len); + common::read_last_token(session, &outputs.result, n_vocab, input_len); + common::extract_logits(output_request, &outputs.result, n_vocab, input_len); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } /// Returns the vocabulary used by this model. diff --git a/crates/models/mpt/Cargo.toml b/crates/models/mpt/Cargo.toml index 03e69fbe..9e2544e8 100644 --- a/crates/models/mpt/Cargo.toml +++ b/crates/models/mpt/Cargo.toml @@ -9,5 +9,4 @@ readme = "../../../README.md" [dependencies] llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - bytemuck = { workspace = true } diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 0ac4c33c..10ce78e9 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -1,12 +1,14 @@ //! An implementation of [MPT](https://huggingface.co/mosaicml) for the `llm` ecosystem. 
#![deny(missing_docs)] +use std::sync::Arc; + use ggml::Tensor; use llm_base::{ - ggml, + ggml::{self}, model::{common, HyperparametersWriteError}, - util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel, - LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, }; /// The MosaicML Pretrained Transformer (MPT) model. Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) @@ -30,8 +32,7 @@ pub struct Mpt { layers: Vec, // must be kept alive for the model - _context: ggml::Context, - _mmap: Option, + context: Arc, } unsafe impl Send for Mpt {} @@ -69,7 +70,7 @@ impl KnownModel for Mpt { layers.push(layer); } - let (_context, _, _mmap) = tl.finish(); + let (context, _) = tl.finish(); let ModelParameters { context_size, .. } = params; @@ -80,8 +81,7 @@ impl KnownModel for Mpt { wte, norm, layers, - _context, - _mmap, + context: Arc::new(context), }) } @@ -102,7 +102,7 @@ impl KnownModel for Mpt { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); + let n = input_tokens.len(); let session_len = session.n_past; let num_threads = params.n_threads; let ctx_size = self.context_size; @@ -116,157 +116,159 @@ impl KnownModel for Mpt { .. } = self.hyperparameters; - let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); - - let mut input_layer = ctx0.op_get_rows(&self.wte, &embd); - - let f32_size = std::mem::size_of::(); - - let memory_k = &session.memory_k; - let memory_k_size = memory_k.element_size(); - - let memory_v = &session.memory_v; - let memory_v_size = memory_v.element_size(); - - let mut gf = ggml::ComputationGraph::new(num_threads); - for il in 0..n_layer { - // attention uses first scratch buffer - ctx0.use_scratch(Some(&mut session.scratch[0])); - - let mut current = ctx0.op_norm(&input_layer); - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), - ¤t, - ); - - current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); - - let nb = current.get_nb()[1]; - let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); - let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); - let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); - - let k = ctx0.op_view_1d( - memory_k, - input_len * n_embd, - (memory_k_size * n_embd) * (il * ctx_size + session_len), - ); - let v = ctx0.op_view_1d( - memory_v, - input_len * n_embd, - (memory_v_size * n_embd) * (il * ctx_size + session_len), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - let q = ctx0.op_permute( - &ctx0.op_cpy( - &qcur, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - ), - (0, 2, 1, 3), + let outputs = session.compute(self.context.clone(), input_tokens, |mut builder| { + let ctx0 = builder.ctx0; + let (memory_k_size, memory_v_size) = ( + builder.memory_k.element_size(), + builder.memory_v.element_size(), ); - - let bigk = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * memory_k_size * n_embd, + let embd = builder.embd; + + let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + + let f32_size = std::mem::size_of::(); + + let mut gf = ggml::ComputationGraph::new(num_threads); 
+ for il in 0..n_layer { + // attention uses first scratch buffer + builder.use_scratch(Some(0)); + + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), + ¤t, + ); + + current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); + + let nb = current.get_nb()[1]; + let qcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, 0); + let kcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd); + let vcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd * 2); + + let k = ctx0.op_view_1d( + builder.memory_k, + n * n_embd, + (memory_k_size * n_embd) * (il * ctx_size + session_len), + ); + let v = ctx0.op_view_1d( + builder.memory_v, + n * n_embd, + (memory_v_size * n_embd) * (il * ctx_size + session_len), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + let q = ctx0.op_permute( + &ctx0.op_cpy( + &qcur, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); + (0, 2, 1, 3), + ); - let kq = ctx0.op_mul_mat(&bigk, &q); - let kq_scaled = ctx0.op_scale( - &kq, - &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - let kq_scaled_alibi = ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); - let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); - let kq_softmax = ctx0.op_soft_max(&kq_masked); - - let v_trans = ctx0.op_cpy( - &ctx0.op_permute( + let bigk = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &session.memory_v, - (session_len + input_len) * n_embd, - il * ctx_size * memory_v_size * n_embd, + builder.memory_k, + (session_len + n) * n_embd, + il * ctx_size * memory_k_size * n_embd, ), n_embd / n_head, n_head, - session_len + input_len, + session_len + n, ), - (1, 2, 0, 3), - ), - &ctx0.new_tensor_3d( - session.memory_v.get_type(), - session_len + input_len, - n_embd / n_head, - n_head, - ), - ); + (0, 2, 1, 3), + ); + + let kq = ctx0.op_mul_mat(&bigk, &q); + let kq_scaled = ctx0.op_scale( + &kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + let kq_scaled_alibi = + ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); + let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); + let kq_softmax = ctx0.op_soft_max(&kq_masked); + + let v_trans = ctx0.op_cpy( + &ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_v, + (session_len + n) * n_embd, + il * ctx_size * memory_v_size * n_embd, + ), + n_embd / n_head, + n_head, + session_len + n, + ), + (1, 2, 0, 3), + ), + &ctx0.new_tensor_3d( + builder.memory_v.get_type(), + session_len + n, + n_embd / n_head, + n_head, + ), + ); - let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); + let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - // projection - current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); + current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + // projection + current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); - input_layer = ctx0.op_add(&input_layer, ¤t); + input_layer = ctx0.op_add(&input_layer, ¤t); - // feed forward uses second scratch buffer - ctx0.use_scratch(Some(&mut 
session.scratch[1])); + // feed forward uses second scratch buffer + builder.use_scratch(Some(1)); - current = ctx0.op_norm(&input_layer); - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), - ¤t, - ); + current = ctx0.op_norm(&input_layer); + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), + ¤t, + ); - current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); + current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); - current = ctx0.op_gelu(¤t); + current = ctx0.op_gelu(¤t); - // projection - current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); + // projection + current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); - input_layer = ctx0.op_add(&input_layer, ¤t); - } + input_layer = ctx0.op_add(&input_layer, ¤t); + } - //use scratch buffer 0 for the rest - ctx0.use_scratch(Some(&mut session.scratch[0])); + //use scratch buffer 0 for the rest + builder.use_scratch(Some(0)); - // norm - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + // norm + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - let embeddings_tensor: ggml::Tensor = input_layer.share(); + let embeddings_tensor: ggml::Tensor = input_layer.share(); - // disable scratch buffer for last layer - ctx0.use_scratch(None); - // output embedding weight tied to input embedding - input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); + // disable scratch buffer for last layer + ctx0.use_scratch(None); + // output embedding weight tied to input embedding + input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); + ( + gf, + GraphOutputs { + result: input_layer, + embedding_result: embeddings_tensor, + }, + ) + }); // finish evaluation - common::read_last_token(session, &input_layer, n_vocab, input_len); - common::extract_logits(output_request, &input_layer, n_vocab, input_len); - common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - common::update_session(session, &ctx0, input_tokens.len(), input_len); + common::read_last_token(session, &outputs.result, n_vocab, n); + common::extract_logits(output_request, &outputs.result, n_vocab, n); + common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } /// Returns the vocabulary used by this model. diff --git a/doc/CONTRIBUTING.md b/doc/CONTRIBUTING.md index 4df639bb..a29d11b5 100644 --- a/doc/CONTRIBUTING.md +++ b/doc/CONTRIBUTING.md @@ -32,7 +32,7 @@ cargo run --release --package generate-ggml-bindings ## Acceleration Support for Building -The `ggml-sys` crate includes various acceleration backends, selectable via `--features` flags. The availability of supported backends varies by platform, and `ggml-sys` can only be built with a single active acceleration backend at a time. If cublas and clblast are both specified, cublas is prioritized and clblast is ignored. +The `ggml-sys` crate includes various acceleration backends, selectable via `--features` flags. The availability of supported backends varies by platform, and `ggml-sys` can only be built with a single active acceleration backend at a time. If cublas and clblast are both specified, cublas is prioritized and clblast is ignored. 
 | Platform/OS | `cublas` | `clblast` | `metal` |
 | ----------- | ------------------ | ------------------ | ------------------ |
@@ -62,7 +62,6 @@
 set OPENCL_PATH=....\vcpkg\packages\opencl_x64-windows
 set CLBLAST_PATH=....\vcpkg\packages\clblast_x64-windows
 ```
-
 ⚠️ When working with MSVC in a Windows environment, it is essential to set the `-Ctarget-feature=+crt-static` Rust flag. This flag is critical as it enables the static linking of the C runtime, which can be paramount for certain deployment scenarios or specific runtime environments.
 To set this flag, you can modify the .cargo\config file in your project directory. Please add the following configuration snippet:
@@ -76,7 +75,6 @@
 This will ensure the Rust flag is appropriately set for your compilation process.
 For a comprehensive guide on the usage of Rust flags, including other possible ways to set them, please refer to this detailed [StackOverflow discussion](https://stackoverflow.com/questions/38040327/how-to-pass-rustc-flags-to-cargo). Make sure to choose an option that best fits your project requirements and development environment.
-
 ⚠️ For `llm` to function properly, it requires the `clblast.dll` and `OpenCL.dll` files. These files can be found within the `bin` subdirectory of their respective vcpkg packages. There are two options to ensure `llm` can access these files:
 1. Amend your `PATH` environment variable to include the `bin` directories of each respective package.
@@ -85,8 +83,6 @@
 Please choose the option that best suits your needs and environment configuration.
-
-
 ### Linux
 #### CuBLAS
@@ -103,6 +99,16 @@
 CLBlast can be installed on Linux through various package managers. For example,
 Xcode and the associated command-line tools should be installed on your system, and you should be running a version of MacOS that supports Metal. For more detailed information, please consult the [official Metal documentation](https://developer.apple.com/metal/).
+To enable Metal from the CLI, ensure the binary was built with `--features=metal`, then pass the `--use-gpu` flag; a sketch of the full workflow follows the list below.
+
+The current underlying implementation of Metal in GGML is still in flux and has some limitations:
+
+- Metal for GGML requires the `ggml-metal.metal` file to be located in the same directory as the binary (i.e., `target/release/`). In future versions, this will likely be embedded in the binary itself.
+- Evaluating a model with more than one token at a time is not currently supported in GGML's Metal implementation. An `llm` inference session will fall back to the CPU implementation (typically during the 'feed prompt' phase) but will automatically use the GPU once a single token is passed per evaluation (typically after prompt feeding).
+- Not all model architectures will be equally stable when used with Metal due to ongoing work in the underlying implementation. Expect `llama` models to work fine though.
+- Metal can be used with or without `mmap`; because buffers do not need to be copied to VRAM on M1, `mmap` is the most efficient option.
+- Debug messages may be logged by the underlying GGML Metal implementation. This will likely go away in the future for release builds of `llm`.
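+
+As a rough sketch of that workflow (the binary name, model path, prompt, and source location of `ggml-metal.metal` below are placeholders rather than part of this change; only `--features metal` and `--use-gpu` come from this PR), a Metal-accelerated run might look like:
+
+```sh
+# Build the CLI with the Metal backend enabled.
+cargo build --release --features metal
+
+# GGML currently expects ggml-metal.metal to sit next to the binary.
+cp path/to/ggml-metal.metal target/release/
+
+# Run inference with GPU acceleration; the subcommand and model/prompt flags
+# shown here are assumed and may differ in your build.
+target/release/llm infer -m path/to/model.bin -p "Tell me a joke" --use-gpu
+```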
+
 ## Debugging
 This repository includes a [`launch.json` file](../.vscode/launch.json) that can
diff --git a/doc/known-good-models.md b/doc/known-good-models.md
index 4f91fe68..162f9db5 100644
--- a/doc/known-good-models.md
+++ b/doc/known-good-models.md
@@ -9,6 +9,7 @@ but this work is ongoing.
 - : note that this is `f16`-only and we recommend you quantize it using `llm` for best performance.
+- 
 ## GPT-J