From 7d71e06b202418a916b4043f493e52edd5a976d5 Mon Sep 17 00:00:00 2001 From: alex-lew Date: Tue, 16 Jul 2024 18:49:28 +0000 Subject: [PATCH] Use black code formatter --- examples/grammar_constraint.py | 1 + examples/haiku.py | 43 ++- examples/hard_constraints.py | 43 +-- hfppl/__init__.py | 2 +- hfppl/chunks.py | 31 +- hfppl/distributions/__init__.py | 2 +- hfppl/distributions/bernoulli.py | 12 +- hfppl/distributions/distribution.py | 15 +- hfppl/distributions/geometric.py | 15 +- hfppl/distributions/lmcontext.py | 125 +++++---- hfppl/distributions/logcategorical.py | 9 +- hfppl/distributions/tokencategorical.py | 27 +- hfppl/distributions/transformer.py | 26 +- hfppl/inference/__init__.py | 2 +- hfppl/inference/smc_standard.py | 28 +- hfppl/inference/smc_steer.py | 29 +- hfppl/llms.py | 359 +++++++++++++++--------- hfppl/modeling.py | 116 ++++---- hfppl/util.py | 11 +- 19 files changed, 545 insertions(+), 351 deletions(-) diff --git a/examples/grammar_constraint.py b/examples/grammar_constraint.py index d3f28ba..965d3b3 100644 --- a/examples/grammar_constraint.py +++ b/examples/grammar_constraint.py @@ -9,6 +9,7 @@ Requires synchromesh (github.com/kanishkg/synchromesh) """ + import asyncio import os from typing import List diff --git a/examples/haiku.py b/examples/haiku.py index c9665cb..91016be 100644 --- a/examples/haiku.py +++ b/examples/haiku.py @@ -4,26 +4,37 @@ import os # download the CMU pronunciation dictionary (if we haven't already) -nltk.download('cmudict') +nltk.download("cmudict") # Load the CMU pronunciation dictionary and use it for syllable counting from nltk.corpus import cmudict + CMUDICT = cmudict.dict() + def count_syllables(word, unknown_word_syllables=100): - + # Use the dictionary to get the list of possible phonetic representations for the word phonetic_transcriptions = CMUDICT.get(word.strip().lower(), []) - + # Count the number of syllables based on the number of phonetic transcriptions - syllable_count = min([len([ph for ph in transcription if ph[-1].isdigit()]) for transcription in phonetic_transcriptions], default=unknown_word_syllables) + syllable_count = min( + [ + len([ph for ph in transcription if ph[-1].isdigit()]) + for transcription in phonetic_transcriptions + ], + default=unknown_word_syllables, + ) return syllable_count + # Load the language model (llama2 if authorized, else mistral-7b). 
-if 'HF_AUTH_TOKEN' in os.environ: - HF_AUTH_TOKEN = os.environ['HF_AUTH_TOKEN'] - LLM = CachedCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", auth_token=HF_AUTH_TOKEN) +if "HF_AUTH_TOKEN" in os.environ: + HF_AUTH_TOKEN = os.environ["HF_AUTH_TOKEN"] + LLM = CachedCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", auth_token=HF_AUTH_TOKEN + ) else: LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") @@ -74,21 +85,22 @@ def count_syllables(word, unknown_word_syllables=100): # Useful constants NEWLINE_TOKEN, EOS_TOKEN = 13, LLM.tokenizer.eos_token_id + # LLaMPPL model class Haiku(Model): - + def __init__(self, prompt, syllable_pattern=[5, 7, 5]): super().__init__() self.context = LMContext(LLM, prompt, 0.7) self.syllable_pattern = syllable_pattern - + async def step(self): # Get the number of syllables required in the next line syllables_remaining = self.syllable_pattern.pop(0) - + # Loop to sample words until this line is over while syllables_remaining > 0: - + # Sample a word word, punctuation = await self.call(sample_word(self.context)) @@ -103,18 +115,19 @@ async def step(self): await self.observe(self.context.next_token(), EOS_TOKEN) self.finish() return - + # Otherwise, observe a line break await self.observe(self.context.next_token(), NEWLINE_TOKEN) # Print current result print(str(self.context)) + # Run inference -SYLLABLES_PER_LINE = [5, 7, 5] # [5, 3, 5] for a Lune +SYLLABLES_PER_LINE = [5, 7, 5] # [5, 3, 5] for a Lune particles = asyncio.run(smc_standard(Haiku(poem_prompt, SYLLABLES_PER_LINE), 120)) print("--------") -for (i,particle) in enumerate(particles): +for i, particle in enumerate(particles): print(f"Poem {i} (weight {particle.weight}):") - print(f"{particle.context}") \ No newline at end of file + print(f"{particle.context}") diff --git a/examples/hard_constraints.py b/examples/hard_constraints.py index 1feb848..75fce14 100644 --- a/examples/hard_constraints.py +++ b/examples/hard_constraints.py @@ -4,22 +4,30 @@ import os -if 'HF_AUTH_TOKEN' in os.environ: - HF_AUTH_TOKEN = os.environ['HF_AUTH_TOKEN'] +if "HF_AUTH_TOKEN" in os.environ: + HF_AUTH_TOKEN = os.environ["HF_AUTH_TOKEN"] -# Load the language model. +# Load the language model. # Mistral and Vicuna are open models; to use a model with restricted access, like LLaMA 2, # pass your HuggingFace API key as the optional `auth_token` argument: # LLM = CachedCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", auth_token=HF_AUTH_TOKEN) -# LLM = CachedCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5") -LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +LLM = CachedCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5") +# LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") LLM.batch_size = 40 -MASKS = {i : set(j for (j,v) in enumerate(LLM.vocab) - if j != LLM.tokenizer.eos_token_id and '\n' not in v and - any(c.isalpha() or c in string.punctuation for c in v) and - len(v.strip()) <= 5 and (not v[0].isalpha() or i+len(v) <= 5)) - for i in range(6)} +MASKS = { + i: set( + j + for (j, v) in enumerate(LLM.vocab) + if j != LLM.tokenizer.eos_token_id + and "\n" not in v + and any(c.isalpha() or c in string.punctuation for c in v) + and len(v.strip()) <= 5 + and (not v[0].isalpha() or i + len(v) <= 5) + ) + for i in range(6) +} + class ConstraintModel(Model): def __init__(self, prompt, max_tokens): @@ -33,26 +41,27 @@ async def step(self): # Condition on next token being from mask await self.observe(self.context.mask_dist(mask), True) - + # Generate proposed token. 
token = await self.sample(self.context.next_token()) - + # Reduce number of max tokens remaining self.max_tokens -= 1 - + print(f"{self.context}") # Check if done if token == LLM.tokenizer.eos_token_id or self.max_tokens == 0: self.finish() return - + def active_constraint_mask(self): string_so_far = str(self.context) words = string_so_far.split() last_word = words[-1] if len(words) > 0 else "" return MASKS[min(5, len(last_word))] - + + # From Politico.com prompt = """3 things to watch … @@ -64,10 +73,12 @@ def active_constraint_mask(self): LLM.cache_kv(LLM.tokenizer.encode(prompt)) + async def main(): constraint_model = ConstraintModel(prompt, 50) particles = await smc_standard(constraint_model, 40) for p in particles: print(f"{p.context}") -asyncio.run(main()) \ No newline at end of file + +asyncio.run(main()) diff --git a/hfppl/__init__.py b/hfppl/__init__.py index aa8da01..ec1651a 100644 --- a/hfppl/__init__.py +++ b/hfppl/__init__.py @@ -6,4 +6,4 @@ from .distributions import * from .modeling import * from .inference import * -from .chunks import * \ No newline at end of file +from .chunks import * diff --git a/hfppl/chunks.py b/hfppl/chunks.py index 1635702..663152c 100644 --- a/hfppl/chunks.py +++ b/hfppl/chunks.py @@ -1,38 +1,51 @@ import string from .modeling import submodel + @submodel async def sample_word(self, context, max_tokens=5, allow_punctuation=True): """Sample a word from the `LMContext` object `context`.""" last_token = context.lm.vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" last_character = last_token[-1] if len(last_token) > 0 else "" - needs_space = last_character not in string.whitespace and last_character not in ['-', "'", '"'] + needs_space = last_character not in string.whitespace and last_character not in [ + "-", + "'", + '"', + ] if needs_space: starts_word_mask = context.lm.masks.STARTS_NEW_WORD else: starts_word_mask = context.lm.masks.CONTINUES_CURRENT_WORD - + # Force model to start a new word await self.observe(context.mask_dist(starts_word_mask), True) word = "" num_tokens = 0 while True: - token = await self.sample(context.next_token()) - word += context.lm.vocab[token.token_id] + token = await self.sample(context.next_token()) + word += context.lm.vocab[token.token_id] num_tokens += 1 if num_tokens == max_tokens: - await self.observe(context.mask_dist(context.lm.masks.CONTINUES_CURRENT_WORD), False) + await self.observe( + context.mask_dist(context.lm.masks.CONTINUES_CURRENT_WORD), False + ) break - if not (await self.sample(context.mask_dist(context.lm.masks.CONTINUES_CURRENT_WORD))): + if not ( + await self.sample( + context.mask_dist(context.lm.masks.CONTINUES_CURRENT_WORD) + ) + ): break - + # Sample punctuation, if desired punctuation = "" - if allow_punctuation and await self.sample(context.mask_dist(context.lm.masks.PUNCTUATION)): + if allow_punctuation and await self.sample( + context.mask_dist(context.lm.masks.PUNCTUATION) + ): punctuation_token = await self.sample(context.next_token()) punctuation = context.lm.vocab[punctuation_token.token_id] - return word, punctuation \ No newline at end of file + return word, punctuation diff --git a/hfppl/distributions/__init__.py b/hfppl/distributions/__init__.py index d80078d..f87cdf0 100644 --- a/hfppl/distributions/__init__.py +++ b/hfppl/distributions/__init__.py @@ -17,4 +17,4 @@ from .tokencategorical import TokenCategorical from .transformer import Transformer from .lmcontext import LMContext -from .bernoulli import Bernoulli \ No newline at end of file +from .bernoulli import 
Bernoulli diff --git a/hfppl/distributions/bernoulli.py b/hfppl/distributions/bernoulli.py index 0d9028a..fed945b 100644 --- a/hfppl/distributions/bernoulli.py +++ b/hfppl/distributions/bernoulli.py @@ -2,13 +2,13 @@ import numpy as np + class Bernoulli(Distribution): - """A Bernoulli distribution. - """ - + """A Bernoulli distribution.""" + def __init__(self, p): """Create a Bernoulli distribution. - + Args: p: the probability-of-True for the Bernoulli distribution. """ @@ -20,6 +20,6 @@ async def sample(self): async def log_prob(self, value): return np.log(self.p) if value else np.log1p(-self.p) - + async def argmax(self, idx): - return ((self.p > 0.5) if idx == 0 else (self.p < 0.5)) \ No newline at end of file + return (self.p > 0.5) if idx == 0 else (self.p < 0.5) diff --git a/hfppl/distributions/distribution.py b/hfppl/distributions/distribution.py index 5296da7..e063b96 100644 --- a/hfppl/distributions/distribution.py +++ b/hfppl/distributions/distribution.py @@ -1,29 +1,28 @@ class Distribution: """Abstract base class for a distribution.""" - async def sample(self): """Generate a random sample from the distribution. - + Returns: x: a value randomly sampled from the distribution.""" raise NotImplementedError() - + async def log_prob(self, x): """Compute the log probability of a value under this distribution, or the log probability density if the distribution is continuous. - + Args: x: the point at which to evaluate the log probability. Returns: - logprob (float): the log probability of `x`.""" + logprob (float): the log probability of `x`.""" raise NotImplementedError() - + async def argmax(self, n): """Return the nth most probable outcome under this distribution (assuming this is a discrete distribution). - + Args: n (int): which value to return to, indexed from most probable (n=0) to least probable (n=|support|). Returns: x: the nth most probable outcome from this distribution.""" - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() diff --git a/hfppl/distributions/geometric.py b/hfppl/distributions/geometric.py index aa2be13..6ebf5e8 100644 --- a/hfppl/distributions/geometric.py +++ b/hfppl/distributions/geometric.py @@ -1,12 +1,13 @@ from .distribution import Distribution +import numpy as np + class Geometric(Distribution): - """A Geometric distribution. - """ - + """A Geometric distribution.""" + def __init__(self, p): """Create a Geometric distribution. - + Args: p: the rate of the Geometric distribution. """ @@ -17,7 +18,7 @@ async def sample(self): return n, await self.log_prob(n) async def log_prob(self, value): - return np.log(self.p) + np.log(1 - self.p)*(value - 1) - + return np.log(self.p) + np.log(1 - self.p) * (value - 1) + async def argmax(self, idx): - return idx - 1 # Most likely outcome is 0, then 1, etc. \ No newline at end of file + return idx - 1 # Most likely outcome is 0, then 1, etc. 
diff --git a/hfppl/distributions/lmcontext.py b/hfppl/distributions/lmcontext.py index 8400066..b2971fe 100644 --- a/hfppl/distributions/lmcontext.py +++ b/hfppl/distributions/lmcontext.py @@ -4,23 +4,24 @@ import numpy as np import copy + class LMNextToken(Distribution): - + def __init__(self, ctx): self.ctx = ctx - + async def log_prob(self, x): if isinstance(x, Token): x = x.token_id - + lp = self.ctx.next_token_logprobs[x] self.ctx.tokens.append(x) updated_logprobs = await self.ctx.lm.next_token_logprobs(self.ctx.tokens) self.ctx.next_token_logprobs = log_softmax(updated_logprobs / self.ctx.temp) self.ctx.model_mask = self.ctx.lm.masks.ALL_TOKENS - + return lp - + async def sample(self): probs = np.exp(self.ctx.next_token_logprobs) token_id = np.random.choice(len(probs), p=(probs)) @@ -28,71 +29,78 @@ async def sample(self): logprob = self.ctx.next_token_logprobs[token_id] # Reset mask and update logprobs - self.ctx.model_mask = self.ctx.lm.masks.ALL_TOKENS - updated_logprobs = await self.ctx.lm.next_token_logprobs(self.ctx.tokens) + self.ctx.model_mask = self.ctx.lm.masks.ALL_TOKENS + updated_logprobs = await self.ctx.lm.next_token_logprobs(self.ctx.tokens) self.ctx.next_token_logprobs = log_softmax(updated_logprobs / self.ctx.temp) - t = Token(self.ctx.lm, token_id, self.ctx.lm.tokenizer.convert_ids_to_tokens(token_id)) + t = Token( + self.ctx.lm, token_id, self.ctx.lm.tokenizer.convert_ids_to_tokens(token_id) + ) return t, logprob - + + class LMTokenMask(Distribution): def __init__(self, ctx, mask): - self.ctx = ctx + self.ctx = ctx self.mask = mask - + async def sample(self): - newly_bad_tokens = [i for i in self.ctx.model_mask if i not in self.mask] - good_tokens = [i for i in self.ctx.model_mask if i in self.mask] - logprob_no_mask = logsumexp(self.ctx.next_token_logprobs[newly_bad_tokens]) + newly_bad_tokens = [i for i in self.ctx.model_mask if i not in self.mask] + good_tokens = [i for i in self.ctx.model_mask if i in self.mask] + logprob_no_mask = logsumexp(self.ctx.next_token_logprobs[newly_bad_tokens]) if logprob_no_mask > 0: - logprob_yes_mask = float('-inf') + logprob_yes_mask = float("-inf") else: # When logprob_no_mask is very close to 0.0, np.log1p can raise a "divide by zero" # warning before returning -inf. We suppress this warning, because returning -inf # is the desired behavior (the LLM places no mass on 'yes'). 
- with np.errstate(divide='ignore'): - logprob_yes_mask = np.log1p(-np.exp(logprob_no_mask)) - decide_no_mask = np.random.rand() < np.exp(logprob_no_mask) + with np.errstate(divide="ignore"): + logprob_yes_mask = np.log1p(-np.exp(logprob_no_mask)) + decide_no_mask = np.random.rand() < np.exp(logprob_no_mask) if decide_no_mask: self.ctx.model_mask = self.ctx.model_mask - self.mask - self.ctx.next_token_logprobs[good_tokens] = float('-inf') + self.ctx.next_token_logprobs[good_tokens] = float("-inf") self.ctx.next_token_logprobs -= logprob_no_mask return False, logprob_no_mask else: self.ctx.model_mask = self.ctx.model_mask.intersection(self.mask) - self.ctx.next_token_logprobs[newly_bad_tokens] = float('-inf') + self.ctx.next_token_logprobs[newly_bad_tokens] = float("-inf") self.ctx.next_token_logprobs -= logprob_yes_mask return True, logprob_yes_mask - + async def log_prob(self, v): - good_tokens = self.ctx.model_mask.intersection(self.mask) if v else self.ctx.model_mask - self.mask - bad_tokens = [i for i in self.ctx.model_mask if i not in good_tokens] + good_tokens = ( + self.ctx.model_mask.intersection(self.mask) + if v + else self.ctx.model_mask - self.mask + ) + bad_tokens = [i for i in self.ctx.model_mask if i not in good_tokens] logprob_good = logsumexp(self.ctx.next_token_logprobs[list(good_tokens)]) - self.ctx.next_token_logprobs[bad_tokens] = float('-inf') + self.ctx.next_token_logprobs[bad_tokens] = float("-inf") self.ctx.next_token_logprobs -= logprob_good self.ctx.model_mask = good_tokens return logprob_good - - + + class LMContext: """Represents a generation-in-progress from a language model. - + The state tracks two pieces of information: - + * A sequence of tokens — the ever-growing context for the language model. * A *current mask* — a set of tokens that have not yet been ruled out as the next token. - + Storing a mask enables _sub-token_ generation: models can use `LMContext` to sample the next token in _stages_, first deciding, e.g., whether to use an upper-case or lower-case first letter, and only later deciding which upper-case or lower-case token to generate. - + The state of a `LMContext` can be advanced in two ways: - + 1. Sampling, observing, or intervening the `next_token()` distribution. This causes a token to be added to the growing sequence of tokens. Supports auto-batching. 2. Sampling, observing, or intervening the `mask_dist(mask)` distribution for a given mask (set of token ids). This changes the current mask. - + Attributes: lm (hfppl.llms.CachedCausalLM): the language model for which this is a context tokens (list[int]): the underlying sequence of tokens, including prompt, in this context @@ -101,52 +109,57 @@ class LMContext: model_mask (set[int]): set of tokens that have not been ruled out as the next token. This mask is managed by the `LMContext` object internally; do not mutate. show_prompt (bool): controls whether the string representation of this `LMContext` includes the initial prompt or not. Defaults to `False`. """ - + def __init__(self, lm, prompt, temp=1.0): """Create a new `LMContext` with a given prompt and temperature. - + Args: lm (hfppl.llms.CachedCausalLM): the language model for which this is a context. prompt (str): a string with which to initialize the context. Will be tokenized using `lm.tokenizer`. 
- temp (float): temeprature for next-token distribution (0 < temp < float('inf'))""" - self.lm = lm - self.tokens = lm.tokenizer.encode(prompt) - self.next_token_logprobs = log_softmax(lm.next_token_logprobs_unbatched(self.tokens) / temp) - self.temp = temp - self.model_mask = lm.masks.ALL_TOKENS + temp (float): temeprature for next-token distribution (0 < temp < float('inf')) + """ + self.lm = lm + self.tokens = lm.tokenizer.encode(prompt) + self.next_token_logprobs = log_softmax( + lm.next_token_logprobs_unbatched(self.tokens) / temp + ) + self.temp = temp + self.model_mask = lm.masks.ALL_TOKENS self.prompt_string_length = len(lm.tokenizer.decode(self.tokens)) - self.show_prompt = False - + self.show_prompt = False + def next_token(self): """Distribution over the next token. - - Sampling or observing from this distribution advances the state of this `LMContext` instance.""" + + Sampling or observing from this distribution advances the state of this `LMContext` instance. + """ return LMNextToken(self) - + def mask_dist(self, mask): """Bernoulli distribution, with probability of True equal to the probability that the next token of this `LMContext` belongs to the given mask. - + Sampling or observing from this distribution modifies the state of this `LMContext` instance, so that the `next_token()` distribution either *will* (if True) or *will not* (if False) generate a token from the given mask. - + Args: - mask: a `set(int)` specifying which token ids are included within the mask.""" - return LMTokenMask(self, mask) - + mask: a `set(int)` specifying which token ids are included within the mask. + """ + return LMTokenMask(self, mask) + def __str__(self): base = 0 if self.show_prompt else self.prompt_string_length full_string = self.lm.tokenizer.decode(self.tokens) return full_string[base:] - - def __deepcopy__(self, memo): + + def __deepcopy__(self, memo): cpy = type(self).__new__(type(self)) - + for k, v in self.__dict__.items(): - if k in set(['lm']): + if k in set(["lm"]): setattr(cpy, k, v) else: setattr(cpy, k, copy.deepcopy(v, memo)) - - return cpy \ No newline at end of file + + return cpy diff --git a/hfppl/distributions/logcategorical.py b/hfppl/distributions/logcategorical.py index 5dcd16c..3756eec 100644 --- a/hfppl/distributions/logcategorical.py +++ b/hfppl/distributions/logcategorical.py @@ -1,13 +1,14 @@ from .distribution import Distribution + class LogCategorical(Distribution): """A Geometric distribution.""" def __init__(self, logits): - """Create a Categorical distribution from unnormalized log probabilities (logits). + """Create a Categorical distribution from unnormalized log probabilities (logits). Given an array of logits, takes their `softmax` and samples an integer in `range(len(logits))` from the resulting categorical. - + Args: logits (np.array): a numpy array of unnormalized log probabilities. 
""" @@ -19,6 +20,6 @@ async def sample(self): async def log_prob(self, value): return self.log_probs[value] - + async def argmax(self, idx): - return np.argsort(self.log_probs)[-idx] \ No newline at end of file + return np.argsort(self.log_probs)[-idx] diff --git a/hfppl/distributions/tokencategorical.py b/hfppl/distributions/tokencategorical.py index e3fe275..7f109f8 100644 --- a/hfppl/distributions/tokencategorical.py +++ b/hfppl/distributions/tokencategorical.py @@ -3,29 +3,38 @@ from ..llms import Token import numpy as np + class TokenCategorical(Distribution): - def __init__(self, lm, logits): - """Create a Categorical distribution whose values are Tokens, not integers. - Given a language model `lm` and an array of unnormalized log probabilities (of length `len(lm.vocab)`), + def __init__(self, lm, logits): + """Create a Categorical distribution whose values are Tokens, not integers. + Given a language model `lm` and an array of unnormalized log probabilities (of length `len(lm.vocab)`), uses softmax to normalize them and samples a Token from the resulting categorical. - + Args: lm (hfppl.llms.CachedCausalLM): the language model whose vocabulary is to be generated from. logits (np.array): a numpy array of unnormalized log probabilities. """ - self.lm = lm + self.lm = lm self.log_probs = log_softmax(logits) if self.lm.tokenizer.vocab_size != len(logits): - raise RuntimeError(f"TokenCategorical: vocab size is {self.lm.tokenizer.vocab_size} but provided {len(logits)} logits.") + raise RuntimeError( + f"TokenCategorical: vocab size is {self.lm.tokenizer.vocab_size} but provided {len(logits)} logits." + ) async def sample(self): n = np.random.choice(len(self.log_probs), p=(np.exp(self.log_probs))) - return Token(self.lm, n, self.lm.tokenizer.convert_ids_to_tokens(n)), self.log_probs[n] + return ( + Token(self.lm, n, self.lm.tokenizer.convert_ids_to_tokens(n)), + self.log_probs[n], + ) async def log_prob(self, value): return self.log_probs[value.token_id] - + async def argmax(self, idx): tok = torch.argsort(self.log_probs)[-idx] - return Token(self.lm, tok, self.lm.tokenizer.convert_ids_to_tokens(tok)), self.log_probs[tok] \ No newline at end of file + return ( + Token(self.lm, tok, self.lm.tokenizer.convert_ids_to_tokens(tok)), + self.log_probs[tok], + ) diff --git a/hfppl/distributions/transformer.py b/hfppl/distributions/transformer.py index 32d05db..9fd834f 100644 --- a/hfppl/distributions/transformer.py +++ b/hfppl/distributions/transformer.py @@ -2,13 +2,14 @@ from ..llms import TokenSequence, Token import numpy as np + # Transformer(lm, prompt) -- where prompt can either be a string or a list of Tokens. class Transformer(Distribution): def __init__(self, lm, prompt, temp=1.0): """Create a Categorical distribution whose values are Tokens, with probabilities given by a language model. Supports auto-batching. - + Args: lm (hfppl.llms.CachedCausalLM): the language model. prompt (str | hfppl.llms.TokenSequence): the sequence of tokens to use as the prompt. If a string, `lm.tokenizer` is used to encode it. 
@@ -16,33 +17,36 @@ def __init__(self, lm, prompt, temp=1.0): """ self.lm = lm self.temp = temp - + # prompt will be a list of ints if isinstance(prompt, str): prompt = self.lm.tokenizer.encode(prompt) elif isinstance(prompt, TokenSequence): prompt = prompt.seq - + self.prompt = prompt - - + async def log_prob(self, x): log_probs = await self.lm.next_token_logprobs(self.prompt) log_probs = log_probs / self.temp - + if isinstance(x, Token): x = x.token_id - + return log_probs[x] - + async def sample(self): log_probs = await self.lm.next_token_logprobs(self.prompt) log_probs = log_probs / self.temp probs = np.exp(log_probs) token_id = np.random.choice(len(probs), p=(probs)) logprob = log_probs[token_id] - return Token(self.lm, token_id, self.lm.tokenizer.convert_ids_to_tokens(token_id)), logprob - + return ( + Token(self.lm, token_id, self.lm.tokenizer.convert_ids_to_tokens(token_id)), + logprob, + ) + + # def argmax(self, idx): # token_id = np.argsort(self.log_probs)[-idx] -# return Token(self.lm, token_id, self.lm.tokenizer.convert_ids_to_tokens(token_id)), log_probs[token_id] \ No newline at end of file +# return Token(self.lm, token_id, self.lm.tokenizer.convert_ids_to_tokens(token_id)), log_probs[token_id] diff --git a/hfppl/inference/__init__.py b/hfppl/inference/__init__.py index 432e969..0dbda60 100644 --- a/hfppl/inference/__init__.py +++ b/hfppl/inference/__init__.py @@ -7,5 +7,5 @@ * `smc_steer(model, num_beams, num_expansions)`: a without-replacement SMC algorithm that resembles beam search. """ -from .smc_standard import smc_standard +from .smc_standard import smc_standard from .smc_steer import smc_steer diff --git a/hfppl/inference/smc_standard.py b/hfppl/inference/smc_standard.py index 941fb2f..10500be 100644 --- a/hfppl/inference/smc_standard.py +++ b/hfppl/inference/smc_standard.py @@ -3,39 +3,47 @@ import numpy as np import asyncio + async def smc_standard(model, n_particles, ess_threshold=0.5): """ Standard sequential Monte Carlo algorithm with multinomial resampling. - + Args: model (hfppl.modeling.Model): The model to perform inference on. n_particles (int): Number of particles to execute concurrently. ess_threshold (float): Effective sample size below which resampling is triggered, given as a fraction of `n_particles`. - + Returns: particles (list[hfppl.modeling.Model]): The completed particles after inference. 
""" particles = [copy.deepcopy(model) for _ in range(n_particles)] weights = [0.0 for _ in range(n_particles)] - - while (any(map(lambda p: not p.done_stepping(), particles))): + + while any(map(lambda p: not p.done_stepping(), particles)): # Step each particle for p in particles: p.untwist() await asyncio.gather(*[p.step() for p in particles if not p.done_stepping()]) - + # Normalize weights W = np.array([p.weight for p in particles]) w_sum = logsumexp(W) normalized_weights = W - w_sum - + # Resample if necessary - if -logsumexp(normalized_weights * 2) < np.log(ess_threshold) + np.log(n_particles): + if -logsumexp(normalized_weights * 2) < np.log(ess_threshold) + np.log( + n_particles + ): # Alternative implementation uses a multinomial distribution and only makes n-1 copies, reusing existing one, but fine for now probs = np.exp(normalized_weights) - particles = [copy.deepcopy(particles[np.random.choice(range(len(particles)), p=probs)]) for _ in range(n_particles)] + particles = [ + copy.deepcopy( + particles[np.random.choice(range(len(particles)), p=probs)] + ) + for _ in range(n_particles) + ] avg_weight = w_sum - np.log(n_particles) for p in particles: p.weight = avg_weight - - return particles \ No newline at end of file + + return particles diff --git a/hfppl/inference/smc_steer.py b/hfppl/inference/smc_steer.py index cf0d88f..4a559b0 100644 --- a/hfppl/inference/smc_steer.py +++ b/hfppl/inference/smc_steer.py @@ -3,6 +3,7 @@ import asyncio from ..util import logsumexp, softmax + def find_c(weights, N): # Sort the weights sorted_weights = np.sort(weights) @@ -19,6 +20,7 @@ def find_c(weights, N): return (N - A_val) / B_val return N + def resample_optimal(weights, N): c = find_c(weights, N) # Weights for which c * w >= 1 are deterministically resampled @@ -46,20 +48,21 @@ def resample_optimal(weights, N): else: i += 1 # Concatenate the deterministic and stochastic resampled indices - #resampled = np.concatenate((deterministic, stoch_resampled)) - #return resampled + # resampled = np.concatenate((deterministic, stoch_resampled)) + # return resampled return deterministic, stoch_resampled, c + async def smc_steer(model, n_particles, n_beam): """ Modified sequential Monte Carlo algorithm that uses without-replacement resampling, as described in [our workshop abstract](https://arxiv.org/abs/2306.03081). - + Args: model (hfppl.modeling.Model): The model to perform inference on. n_particles (int): Number of particles to maintain. n_beam (int): Number of continuations to consider for each particle. - + Returns: particles (list[hfppl.modeling.Model]): The completed particles after inference. """ @@ -67,7 +70,7 @@ async def smc_steer(model, n_particles, n_beam): particles = [copy.deepcopy(model) for _ in range(n_particles)] for particle in particles: - particle.start() # TODO: allow to be async? + particle.start() # TODO: allow to be async? 
while any(map(lambda p: not p.done_stepping(), particles)): # Count the number of finished particles @@ -83,23 +86,29 @@ async def smc_steer(model, n_particles, n_beam): p.weight += np.log(n_total) - np.log(n_particles) else: p.weight += np.log(n_total) - np.log(n_particles) - np.log(n_beam) - super_particles.extend([copy.deepcopy(p) for _ in range(n_beam-1)]) - + super_particles.extend([copy.deepcopy(p) for _ in range(n_beam - 1)]) + # Step each super-particle - await asyncio.gather(*[p.step() for p in super_particles if not p.done_stepping()]) + await asyncio.gather( + *[p.step() for p in super_particles if not p.done_stepping()] + ) # Use optimal resampling to resample W = np.array([p.weight for p in super_particles]) W_tot = logsumexp(W) W_normalized = softmax(W) det_indices, stoch_indices, c = resample_optimal(W_normalized, n_particles) - particles = [super_particles[i] for i in np.concatenate((det_indices, stoch_indices))] + particles = [ + super_particles[i] for i in np.concatenate((det_indices, stoch_indices)) + ] # For deterministic particles: w = w * N/N' for i in det_indices: super_particles[i].weight += np.log(n_particles) - np.log(n_total) # For stochastic particles: w = 1/c * total sum(stoch weights) / num_stoch = sum(stoch weights / total) / num_stoch * total * N/M for i in stoch_indices: - super_particles[i].weight = W_tot - np.log(c) + np.log(n_particles) - np.log(n_total) + super_particles[i].weight = ( + W_tot - np.log(c) + np.log(n_particles) - np.log(n_total) + ) # Return the particles return particles diff --git a/hfppl/llms.py b/hfppl/llms.py index b3bec8a..1e554ec 100644 --- a/hfppl/llms.py +++ b/hfppl/llms.py @@ -5,34 +5,47 @@ import asyncio import string + class Masks: def __init__(self, lm): self.ALL_TOKENS = set(range(len(lm.vocab))) - self.STARTS_NEW_WORD = set(i for (i,v) in enumerate(lm.vocab) if v[0]==' ' and len(v) > 1 and v[1] not in string.whitespace and v[1] not in string.punctuation) - self.CONTINUES_CURRENT_WORD = set(i for (i,v) in enumerate(lm.vocab) if all(c in '\'' or c.isalpha() for c in v)) - self.PUNCTUATION = set(i for (i,v) in enumerate(lm.vocab) if v in ',:;.!?"-') - self.END_SENTENCE_PUNCT = set(i for (i, v) in enumerate(lm.vocab) if v in '.!?') + self.STARTS_NEW_WORD = set( + i + for (i, v) in enumerate(lm.vocab) + if v[0] == " " + and len(v) > 1 + and v[1] not in string.whitespace + and v[1] not in string.punctuation + ) + self.CONTINUES_CURRENT_WORD = set( + i + for (i, v) in enumerate(lm.vocab) + if all(c in "'" or c.isalpha() for c in v) + ) + self.PUNCTUATION = set(i for (i, v) in enumerate(lm.vocab) if v in ',:;.!?"-') + self.END_SENTENCE_PUNCT = set(i for (i, v) in enumerate(lm.vocab) if v in ".!?") class TokenSequence: """A sequence of tokens. - + Supports addition (via `+` or mutating `+=`) with: - + * other `TokenSequence` instances (concatenation) * individual tokens, represented as integers or `Token` instances * strings, which are tokenized by `lm.tokenizer` - + Attributes: lm (hfppl.llms.CachedCausalLM): the language model whose vocabulary the tokens come from. seq (list[hfppl.llms.Token]): the sequence of tokens.""" - + def __init__(self, lm, seq=None): """Create a `TokenSequence` from a language model and a sequence. - + Args: lm (hfppl.llms.CachedCausalLM): the language model whose vocabulary the tokens come from. - seq (str | list[int]): the sequence of token ids, or a string which will be automatically tokenized. 
Defaults to the singleton sequence containing a bos token.""" + seq (str | list[int]): the sequence of token ids, or a string which will be automatically tokenized. Defaults to the singleton sequence containing a bos token. + """ self.lm = lm if seq is None: self.seq = [lm.tokenizer.bos_token_id] @@ -40,10 +53,10 @@ def __init__(self, lm, seq=None): self.seq = self.lm.tokenizer.encode(seq) else: self.seq = seq - + def __str__(self): return self.lm.tokenizer.decode(self.seq) - + def __iadd__(self, other): if isinstance(other, Token): assert other.lm is self.lm @@ -58,7 +71,7 @@ def __iadd__(self, other): else: raise RuntimeError(f"Addition not supported on {type(other)}") return self - + def __radd__(self, other): if isinstance(other, Token): assert other.lm is self.lm @@ -67,30 +80,35 @@ def __radd__(self, other): assert other.lm is self.lm return TokenSequence(self.lm, other.seq + self.seq) elif isinstance(other, str): - return TokenSequence(self.lm, self.lm.tokenizer.encode(other, add_special_tokens=False) + self.seq) + return TokenSequence( + self.lm, + self.lm.tokenizer.encode(other, add_special_tokens=False) + self.seq, + ) elif isinstance(other, int): return TokenSequence(self.lm, [other, *self.seq]) else: raise RuntimeError(f"Addition not supported on {type(other)}") - + def __add__(self, other): s = TokenSequence(self.lm, self.seq) s += other return s + class Token: """Class representing a token. - + Attributes: lm (hfppl.llms.CachedCausalLM): the language model for which this is a Token. token_id (int): the integer token id (an index into the vocabulary). - token_str (str): a string, which the token represents—equal to `lm.vocab[token_id]`.""" - + token_str (str): a string, which the token represents—equal to `lm.vocab[token_id]`. + """ + def __init__(self, lm, token_id, token_str): - self.lm = lm - self.token_id = token_id + self.lm = lm + self.token_id = token_id self.token_str = token_str - + # Adding tokens def __add__(self, other): s = TokenSequence(self.lm, [self.token_id]) @@ -100,7 +118,7 @@ def __add__(self, other): def __radd__(self, other): s = TokenSequence(self.lm, [self.token_id]) return other + s - + # Support checking for EOS def __eq__(self, other): if isinstance(other, Token): @@ -112,90 +130,120 @@ def __eq__(self, other): def __str__(self): return self.token_str - + def __repr__(self): return f"<{self.token_str}|{self.token_id}>" + class TokenTrie: """Class used internally to cache language model results.""" + # Trie of tokens. 
- def __init__(self, parent=None, logprobs=None): - self.children = {} # maps token ID to child + def __init__(self, parent=None, logprobs=None): + self.children = {} # maps token ID to child self.logprobs = logprobs # for next token self.past_key_values = None - + def __repr__(self): - return f"{'*' if self.past_key_values is not None else ''}[" + ", ".join([f"{node_id}: {node.__repr__()}" for (node_id, node) in self.children.items()]) + "]" - + return ( + f"{'*' if self.past_key_values is not None else ''}[" + + ", ".join( + [ + f"{node_id}: {node.__repr__()}" + for (node_id, node) in self.children.items() + ] + ) + + "]" + ) + def clear_kv_cache(self): self.past_key_values = None - for (child, node) in self.children.items(): + for child, node in self.children.items(): node.clear_kv_cache() - + def has_token(self, token_id): return token_id in self.children - + def get_token(self, token_id): return self.children[token_id] - + def add_token(self, token_id, logprobs=None): self.children[token_id] = TokenTrie(self, logprobs) return self.children[token_id] - def extend_cache(self, next_token_index, token_ids, logits, base): node = self - + for j in range(next_token_index, len(token_ids)): - token_id = token_ids[j] - token_logits = logits[j-base] - token_logprobs = torch.log_softmax(token_logits, 0) - + token_id = token_ids[j] + token_logits = logits[j - base] + token_logprobs = torch.log_softmax(token_logits, 0) + node = node.add_token(token_id, token_logprobs.cpu().numpy()) - + return node + class Query: """A query to a language model, waiting to be batched.""" - + def __init__(self, prompt, future, past=None): self.prompt = prompt self.future = future self.past = past - + if self.past is not None: - self.past_len = past[0][0].shape[2] # layers, key or value, batch size, num heads, num tokens, head repr length + self.past_len = past[0][0].shape[ + 2 + ] # layers, key or value, batch size, num heads, num tokens, head repr length else: self.past_len = 0 - + @torch.no_grad() def past_padded(self, layer, j, to_length, dtype, device, past_shape): - + if self.past is not None: - return torch.cat((self.past[layer][j], torch.zeros(1, past_shape[1], to_length-self.past_len, past_shape[3], dtype=dtype, device=device)), - dim=2) + return torch.cat( + ( + self.past[layer][j], + torch.zeros( + 1, + past_shape[1], + to_length - self.past_len, + past_shape[3], + dtype=dtype, + device=device, + ), + ), + dim=2, + ) else: - return torch.zeros(1, past_shape[1], to_length, past_shape[3], dtype=dtype, device=device) - + return torch.zeros( + 1, past_shape[1], to_length, past_shape[3], dtype=dtype, device=device + ) + def prompt_padded(self, pad_token, to_length): - return [*self.prompt, *[pad_token for _ in range(to_length-len(self.prompt))]] - - + return [*self.prompt, *[pad_token for _ in range(to_length - len(self.prompt))]] + def attention_mask(self, total_past_length, total_seq_length): - return [*[1 for _ in range(self.past_len)], - *[0 for _ in range(total_past_length-self.past_len)], - *[1 for _ in range(len(self.prompt))], - *[0 for _ in range(total_seq_length-len(self.prompt))]] - + return [ + *[1 for _ in range(self.past_len)], + *[0 for _ in range(total_past_length - self.past_len)], + *[1 for _ in range(len(self.prompt))], + *[0 for _ in range(total_seq_length - len(self.prompt))], + ] + def position_ids(self, total_past_length, total_seq_length): - return [*range(self.past_len, self.past_len + len(self.prompt)), - *[0 for _ in range(total_seq_length-len(self.prompt))]] - + return [ + 
*range(self.past_len, self.past_len + len(self.prompt)), + *[0 for _ in range(total_seq_length - len(self.prompt))], + ] + class CachedCausalLM: """Wrapper around a HuggingFace causal language model, with support for caching. - + Attributes: model: the underlying HuggingFace model. tokenizer: the underlying HuggingFace tokenizer. @@ -205,33 +253,40 @@ class CachedCausalLM: batch_size (int): when auto-batching, maximum number of queries to process in one batch. timeout (float): number of seconds to wait since last query before processing the current batch of queries, even if not full. """ - + @classmethod def from_pretrained(cls, model_id, auth_token=False, load_in_8bit=True): """Create a [`CachedCausalLM`][hfppl.llms.CachedCausalLM] from a pretrained HuggingFace model. - + Args: model_id (str): the string identifier of the model in HuggingFace's model library. auth_token (str): a HuggingFace API key. Only necessary if using private models, e.g. Meta's Llama models, which require authorization. load_in_8bit (bool): whether to use the `bitsandbytes` library to load the model in 8-bit quantized form. - + Returns: model (hfppl.llms.CachedCausalLM): the LLaMPPL-compatible interface to the HuggingFace model. """ if not auth_token: tok = AutoTokenizer.from_pretrained(model_id) - mod = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=load_in_8bit) + mod = AutoModelForCausalLM.from_pretrained( + model_id, device_map="auto", load_in_8bit=load_in_8bit + ) else: tok = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth_token) - mod = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=auth_token, device_map="auto", load_in_8bit=load_in_8bit) - + mod = AutoModelForCausalLM.from_pretrained( + model_id, + use_auth_token=auth_token, + device_map="auto", + load_in_8bit=load_in_8bit, + ) + return CachedCausalLM(mod, tok) - + @torch.no_grad() def __init__(self, hf_model, hf_tokenizer, batch_size=20): """ Create a `CachedCausalLM` from a loaded HuggingFace model and tokenizer. - + Args: hf_model: a HuggingFace `CausalLM`. hf_tokenizer: a HuggingFace `Tokenizer`. 
@@ -240,21 +295,28 @@ def __init__(self, hf_model, hf_tokenizer, batch_size=20): self.model = hf_model self.tokenizer = hf_tokenizer self.device = hf_model.device - + # TODO: remove required BOS token if self.tokenizer.bos_token_id is None: - raise RuntimeError("Causal LM has no BOS token, distribution of first word unclear") - + raise RuntimeError( + "Causal LM has no BOS token, distribution of first word unclear" + ) + # Evaluate BOS token - logits = self.model(torch.tensor([[self.tokenizer.bos_token_id]]).to(self.model.device)).logits[0][0] + logits = self.model( + torch.tensor([[self.tokenizer.bos_token_id]]).to(self.model.device) + ).logits[0][0] logprobs = torch.log_softmax(logits, 0) - + self.cache = TokenTrie(None, logprobs.cpu().numpy()) - + # Cache vocabulary - bos_len = len(self.tokenizer.decode([self.tokenizer.bos_token_id])) - self.vocab = [self.tokenizer.decode([self.tokenizer.bos_token_id,i])[bos_len:] for i in range(len(hf_tokenizer.vocab))] - + bos_len = len(self.tokenizer.decode([self.tokenizer.bos_token_id])) + self.vocab = [ + self.tokenizer.decode([self.tokenizer.bos_token_id, i])[bos_len:] + for i in range(len(hf_tokenizer.vocab)) + ] + # Precompute useful masks self.masks = Masks(self) @@ -264,64 +326,98 @@ def __init__(self, hf_model, hf_tokenizer, batch_size=20): self.batch_size = batch_size self.timeout = 0.02 self.timer = None - + def __deepcopy__(self, memo): return self - + def clear_cache(self): """Clear the cache of log probabilities and key/value pairs.""" self.cache = TokenTrie(None, self.cache.logprobs) - + def clear_kv_cache(self): """Clear any key and value vectors from the cache.""" self.cache.clear_kv_cache() - + def reset_async_queries(self): - """Clear any pending language model queries from the queue. Use this method when an exception prevented an inference algorithm from executing + """Clear any pending language model queries from the queue. Use this method when an exception prevented an inference algorithm from executing to completion.""" self.queries = [] - + @torch.no_grad() def cache_kv(self, prompt_tokens): """Cache the key and value vectors for a prompt. Future queries that have this prompt as a prefix will only run the LLM on new tokens. - + Args: prompt_tokens (list[int]): token ids for the prompt to cache. 
""" result = self.model(torch.tensor([prompt_tokens]).to(self.device)) - + node = self.cache.extend_cache(1, prompt_tokens, result.logits[0], 0) node.past_key_values = result.past_key_values - + @torch.no_grad() def batch_evaluate_queries(self): - + queries, self.queries = self.queries, [] if len(queries) == 0: return - + past_example = next((q.past for q in queries if q.past), False) max_past_length = max(q.past_len for q in queries) max_query_length = max(len(q.prompt) for q in queries) - - padding_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 - - input_ids = torch.tensor([q.prompt_padded(padding_token_id, max_query_length) for q in queries]).to(self.device) - attn_masks = torch.tensor([q.attention_mask(max_past_length, max_query_length) for q in queries]).to(self.device) - posn_ids = torch.tensor([q.position_ids(max_past_length, max_query_length) for q in queries]).to(self.device) + + padding_token_id = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else 0 + ) + + input_ids = torch.tensor( + [q.prompt_padded(padding_token_id, max_query_length) for q in queries] + ).to(self.device) + attn_masks = torch.tensor( + [q.attention_mask(max_past_length, max_query_length) for q in queries] + ).to(self.device) + posn_ids = torch.tensor( + [q.position_ids(max_past_length, max_query_length) for q in queries] + ).to(self.device) if past_example: - pasts = [[torch.cat((*(q.past_padded(layer, j, max_past_length, past_example[0][0].dtype, self.device, past_example[0][0].shape) for q in queries),), dim=0) - for j in range(2)] for layer in range(len(past_example))] + pasts = [ + [ + torch.cat( + ( + *( + q.past_padded( + layer, + j, + max_past_length, + past_example[0][0].dtype, + self.device, + past_example[0][0].shape, + ) + for q in queries + ), + ), + dim=0, + ) + for j in range(2) + ] + for layer in range(len(past_example)) + ] else: pasts = None - - results = self.model(input_ids, attention_mask=attn_masks, - position_ids=posn_ids, past_key_values=pasts, - use_cache=pasts is not None) - - for (i, q) in enumerate(queries): + + results = self.model( + input_ids, + attention_mask=attn_masks, + position_ids=posn_ids, + past_key_values=pasts, + use_cache=pasts is not None, + ) + + for i, q in enumerate(queries): q.future.set_result(results.logits[i]) - + @torch.no_grad() def add_query(self, query, future, past): self.queries.append(Query(query, future, past)) @@ -332,11 +428,13 @@ def add_query(self, query, future, past): if len(self.queries) >= self.batch_size: self.batch_evaluate_queries() else: - self.timer = asyncio.get_running_loop().call_later(self.timeout, lambda: self.batch_evaluate_queries()) - + self.timer = asyncio.get_running_loop().call_later( + self.timeout, lambda: self.batch_evaluate_queries() + ) + def walk_cache(self, token_ids): # Walk while tokens can be found - node = self.cache + node = self.cache next_token_index = 1 past = None @@ -350,60 +448,65 @@ def walk_cache(self, token_ids): next_token_index += 1 else: break - + return node, next_token_index, past, base - + @torch.no_grad() async def next_token_logprobs(self, token_ids): - """Request log probabilities of next token. This version is asynchronous because it automatically batches concurrent requests; use with `await`. - + """Request log probabilities of next token. This version is asynchronous because it automatically batches concurrent requests; use with `await`. 
+ Args: token_ids (list[int]): a list of token ids starting with `tokenizer.bos_token_id`, representing a prompt to the language model. - + Returns: logprobs (numpy.array): a numpy array of `len(vocab)`, with the language model's log (normalized) probabilities for the next token following the prompt. """ - + # Ensure that token list begins with BOS assert token_ids[0] == self.tokenizer.bos_token_id - + node, next_token_index, past, base = self.walk_cache(token_ids) - + # If we processed all tokens, then we're done. if next_token_index == len(token_ids): return node.logprobs - + # Create a future with the prompt future = asyncio.get_running_loop().create_future() self.add_query(token_ids[base:], future, past) logits = await future - + # Create new nodes node = node.extend_cache(next_token_index, token_ids, logits, base) - + return node.logprobs - + @torch.no_grad() def next_token_logprobs_unbatched(self, token_ids): """Request log probabilities of next token. Not asynchronous, and does not support auto-batching. - + Args: token_ids (list[int]): a list of token ids starting with `tokenizer.bos_token_id`, representing a prompt to the language model. - + Returns: - logprobs (numpy.array): a numpy array of `len(vocab)`, with the language model's log (normalized) probabilities for the next token following the prompt.""" - + logprobs (numpy.array): a numpy array of `len(vocab)`, with the language model's log (normalized) probabilities for the next token following the prompt. + """ + # Ensure that token list begins with BOS assert token_ids[0] == self.tokenizer.bos_token_id - + # Walk while tokens can be found node, next_token_index, past, base = self.walk_cache(token_ids) - + if next_token_index == len(token_ids): return node.logprobs - - logits = self.model(torch.tensor([token_ids[base:]]).to(self.device), past_key_values=node.past_key_values, use_cache=node.past_key_values is not None).logits[0] - + + logits = self.model( + torch.tensor([token_ids[base:]]).to(self.device), + past_key_values=node.past_key_values, + use_cache=node.past_key_values is not None, + ).logits[0] + node = node.extend_cache(next_token_index, token_ids, logits, base) - + return node.logprobs diff --git a/hfppl/modeling.py b/hfppl/modeling.py index 3d7be22..b090c02 100644 --- a/hfppl/modeling.py +++ b/hfppl/modeling.py @@ -1,10 +1,11 @@ import copy + class SubModel: def __init__(self): self.parent = None - + async def run_with_parent(self, parent): old_parent = self.parent self.parent = parent @@ -13,37 +14,41 @@ async def run_with_parent(self, parent): return val async def forward(self): - raise NotImplementedError("SubModel.forward() must be implemented by subclasses") + raise NotImplementedError( + "SubModel.forward() must be implemented by subclasses" + ) async def sample(self, dist, proposal=None): return await self.parent.sample(dist, proposal) - + async def observe(self, dist, x): return await self.parent.observe(dist, x) - + async def intervene(self, dist, x): return await self.parent.intervene(dist, x) - + def condition(self, b): return self.parent.condition(b) - + def score(self, score): return self.parent.score(score) - + def twist(self, amt): return self.parent.twist(amt) - + async def call(self, submodel): - return (await submodel.run_with_parent(self.parent)) + return await submodel.run_with_parent(self.parent) + # For use as a decorator import functools + def submodel(f): """Decorator to create a SubModel implementation from an async function. 
- + For example: - + ```python @submodel async def sample_two_tokens(self, context): @@ -54,25 +59,27 @@ async def sample_two_tokens(self, context): This SubModel can then be used from another model or submodel, using the syntax `await self.call(sample_two_tokens(context))`. """ - @functools.wraps(f, updated=()) # unclear if this is the best way to do it + + @functools.wraps(f, updated=()) # unclear if this is the best way to do it class SubModelImpl(SubModel): def __init__(self, *args, **kwargs): super().__init__() self.args = args self.kwargs = kwargs - + async def forward(self): - return (await f(self, *self.args, **self.kwargs)) - + return await f(self, *self.args, **self.kwargs) + return SubModelImpl + class Model: """Base class for all LLaMPPL models. - + Your models should subclass this class. Minimally, you should provide an `__init__` method that calls `super().__init__(self)`, and a `step` method. """ - + def __init__(self): self.weight = 0.0 self.finished = False @@ -92,68 +99,67 @@ def reset(self): def immutable_properties(self): """Return a `set[str]` of properties that LLaMPPL may assume do not change during execution of `step`. This set is empty by default but can be overridden by subclasses to speed up inference. - + Returns: properties (set[str]): a set of immutable property names""" return set() - - def __deepcopy__(self, memo): + + def __deepcopy__(self, memo): cpy = type(self).__new__(type(self)) immutable = self.immutable_properties() - + for k, v in self.__dict__.items(): if k in immutable: setattr(cpy, k, v) else: setattr(cpy, k, copy.deepcopy(v, memo)) - + return cpy - def twist(self, amt): """Multiply this particle's weight by `exp(amt)`, but divide it back out before the next `step`. - + Use this method to provide heuristic guidance about whether a particle is "on the right track" without changing the ultimate target distribution. - + Args: amt: the logarithm of the amount by which to (temporarily) multiply this particle's weight. """ self.twist_amount += amt self.score(amt) - + def untwist(self): self.score(-self.twist_amount) self.twist_amount = 0.0 - + def finish(self): self.untwist() self.finished = True - + def done_stepping(self): return self.finished async def step(self): """Defines the computation performed in each step of the model. - + All subclasses should override this method.""" - + if not self.done_stepping(): raise NotImplementedError("Model.step() must be implemented by subclasses") - + def __str__(self): return "Particle" - + def start(self): pass - + def score(self, score): """Multiply this particle's weight by `exp(score)`. - + The `score` method is a low-level way to change the target distribution. For many use cases, it is sufficient to use `sample`, `observe`, `condition`, and `twist`, all of which are implemented in terms of `score`. - + Args: score: logarithm of the amount by which the particle's weight should be multiplied. """ @@ -161,54 +167,54 @@ def score(self, score): def condition(self, b): """Constrain a given Boolean expression to be `True`. - + If the condition is False, the particle's weight is set to zero and `self.finish()` is called, so that no further `step` calls are made. - + Args: b: the Boolean expression whose value is constrained to be True. """ if not b: - self.score(float('-inf')) + self.score(float("-inf")) self.finish() - + async def intervene(self, dist, x): """Force the distribution to take on the value `x`, but do not _condition_ on this result. 
- + This is useful primarily with distributions that have side effects (e.g., modifying some state). For example, a model with the code - + ```python token_1 = await self.sample(self.stateful_lm.next_token()) await self.observe(self.stateful_lm.next_token(), token_2) ``` - + encodes a posterior inference problem, to find `token_1` values that *likely preceded* `token_2`. By contrast, - + ```python token_1 = await self.sample(stateful_lm.next_token()) await self.intervene(self.stateful_lm.next_token(), token_2) ``` - + encodes a much easier task: freely generate `token_1` and then force-feed `token_2` as the following token. - + Args: dist (hfppl.distributions.distribution.Distribution): the distribution on which to intervene. x: the value to intervene with. """ await dist.log_prob(x) return x - + async def observe(self, dist, x): """Condition the model on the value `x` being sampled from the distribution `dist`. - + For discrete distributions `dist`, `await self.observe(dist, x)` specifies the same constraint as ``` val = await self.sample(dist) self.condition(val == x) ``` but can be much more efficient. - + Args: dist: a `Distribution` object from which to observe x: the value observed from `dist` @@ -216,16 +222,16 @@ async def observe(self, dist, x): p = await dist.log_prob(x) self.score(p) return x - + async def sample(self, dist, proposal=None): - """Extend the model with a sample from a given `Distribution`, with support for autobatching. + """Extend the model with a sample from a given `Distribution`, with support for autobatching. If specified, the Distribution `proposal` is used during inference to generate informed hypotheses. - + Args: dist: the `Distribution` object from which to sample proposal: if provided, inference algorithms will use this `Distribution` object to generate proposed samples, rather than `dist`. However, importance weights will be adjusted so that the target posterior is independent of the proposal. - + Returns: value: the value sampled from the distribution. """ @@ -238,7 +244,7 @@ async def sample(self, dist, proposal=None): # else: # self.score(w) # return x - + if proposal is None: x, _ = await dist.sample() return x @@ -247,6 +253,6 @@ async def sample(self, dist, proposal=None): p = await dist.log_prob(x) self.score(p - q) return x - + async def call(self, submodel): - return await submodel.run_with_parent(self) \ No newline at end of file + return await submodel.run_with_parent(self) diff --git a/hfppl/util.py b/hfppl/util.py index ee01d7e..2831945 100644 --- a/hfppl/util.py +++ b/hfppl/util.py @@ -2,20 +2,23 @@ import numpy as np + def logsumexp(nums): m = np.max(nums) return np.log(np.sum(np.exp(nums - m))) + m - + + def log_softmax(nums): """Compute log(softmax(nums)). - + Args: nums: a vector or numpy array of unnormalized log probabilities. - + Returns: np.array: an array of log (normalized) probabilities. """ return nums - logsumexp(nums) + def softmax(nums): - return np.exp(log_softmax(nums)) \ No newline at end of file + return np.exp(log_softmax(nums))
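
For reviewers who want to see the reformatted API in context, here is a minimal usage sketch assembled from the pieces this patch touches (`CachedCausalLM`, `LMContext`, `Model`, and `smc_standard`). It is illustrative only and not part of the patch: the model id, prompt, particle count, and token budget are assumptions; the newline token id 13 follows the constant used in `examples/haiku.py` (Llama-family tokenizers); and the top-level `from hfppl import ...` assumes these names are re-exported by the package `__init__`, as the example scripts rely on.

```python
import asyncio

from hfppl import CachedCausalLM, LMContext, Model, smc_standard

# Load an open model (the examples above also show gated models via `auth_token`).
LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
LLM.batch_size = 40


class SingleLine(Model):
    """Generate one line of text, token by token, under a token budget."""

    def __init__(self, prompt, max_tokens=25):
        super().__init__()
        self.context = LMContext(LLM, prompt)
        self.max_tokens = max_tokens

    async def step(self):
        # Sample the next token; next_token() supports auto-batching across particles.
        token = await self.sample(self.context.next_token())
        self.max_tokens -= 1
        # Stop on end-of-sequence, on a newline (id 13 in Llama-family vocabularies,
        # as in examples/haiku.py), or when the token budget is exhausted.
        if (
            token == LLM.tokenizer.eos_token_id
            or token.token_id == 13
            or self.max_tokens == 0
        ):
            self.finish()


async def main():
    particles = await smc_standard(SingleLine("The weather today is", 25), 10)
    for p in particles:
        print(f"{p.weight:.2f}\t{p.context}")


asyncio.run(main())
```

Calling `LLM.cache_kv(LLM.tokenizer.encode(prompt))` before inference, as `examples/hard_constraints.py` does, would additionally cache the prompt's key/value vectors so that particles only pay for newly generated tokens.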