Modify get_cost to account for prompt caching
yadavsahil197 committed Oct 2, 2024
1 parent d1417b2 commit 5c07717
Showing 12 changed files with 64 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/autolabel/models/anthropic.py
@@ -105,7 +105,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
10 changes: 8 additions & 2 deletions src/autolabel/models/base.py
@@ -50,7 +50,11 @@ async def label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             new_results = self._label(missing_prompts, output_schema)
             for ind, prompt in enumerate(missing_prompts):
                 costs.append(
-                    self.get_cost(prompt, label=new_results.generations[ind][0].text)
+                    self.get_cost(
+                        prompt,
+                        label=new_results.generations[ind][0].text,
+                        llm_output=new_results.llm_output,
+                    )
                 )
 
             # Set the existing prompts to the new results
@@ -77,7 +81,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         pass
 
     @abstractmethod
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         pass
 
     def get_cached_prompts(self, prompts: List[str]) -> Optional[str]:
4 changes: 3 additions & 1 deletion src/autolabel/models/cohere.py
@@ -66,7 +66,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.co.tokenize(prompt).tokens)
         if label:
             num_label_toks = len(self.co.tokenize(label).tokens)
4 changes: 3 additions & 1 deletion src/autolabel/models/google.py
@@ -151,7 +151,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         if self.model_name is None:
             return 0.0
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline.py
@@ -116,7 +116,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline_vision.py
@@ -107,7 +107,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=[None] * len(generations)
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/mistral.py
@@ -197,7 +197,9 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=errors, latencies=latencies
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         cost_per_prompt_char = self.COST_PER_PROMPT_TOKEN[self.model_name]
         cost_per_completion_char = self.COST_PER_COMPLETION_TOKEN[self.model_name]
         return cost_per_prompt_char * len(prompt) + cost_per_completion_char * (
36 changes: 27 additions & 9 deletions src/autolabel/models/openai.py
@@ -235,6 +235,7 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -307,6 +308,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -339,19 +341,35 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        encoding = self.tiktoken.encoding_for_model(self.model_name)
-        num_prompt_toks = len(encoding.encode(prompt))
-        if label:
-            num_label_toks = len(encoding.encode(label))
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
+        num_cached_toks = 0
+        if llm_output and "token_usage" in llm_output:
+            num_prompt_toks = llm_output["token_usage"]["prompt_tokens"]
+            num_label_toks = llm_output["token_usage"]["completion_tokens"]
+            num_cached_toks = (
+                llm_output["token_usage"]
+                .get("prompt_tokens_details", {})
+                .get("cached_tokens", 0)
+            )
+            num_prompt_toks -= num_cached_toks
         else:
-            # get an upper bound
-            num_label_toks = self.model_params["max_tokens"]
+            encoding = self.tiktoken.encoding_for_model(self.model_name)
+            num_prompt_toks = len(encoding.encode(prompt))
+            if label:
+                num_label_toks = len(encoding.encode(label))
+            else:
+                # get an upper bound
+                num_label_toks = self.model_params["max_tokens"]
 
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
+        cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
         cost_per_completion_token = self.COST_PER_COMPLETION_TOKEN[self.model_name]
-        return (num_prompt_toks * cost_per_prompt_token) + (
-            num_label_toks * cost_per_completion_token
+        return (
+            (num_prompt_toks * cost_per_prompt_token)
+            + (num_cached_toks * cost_per_cached_prompt_token)
+            + (num_label_toks * cost_per_completion_token)
         )
 
     def returns_token_probs(self) -> bool:
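For context, the updated OpenAI get_cost bills cached prompt tokens at half the regular prompt-token rate (cost_per_cached_prompt_token = cost_per_prompt_token / 2.0), subtracts them from the billed prompt tokens, and falls back to counting tokens with tiktoken when no token usage is reported. Below is a minimal sketch of the arithmetic, using made-up per-token prices and token counts rather than the repository's actual COST_PER_PROMPT_TOKEN and COST_PER_COMPLETION_TOKEN tables:

# Illustrative prices and usage only -- not values from the repository.
COST_PER_PROMPT_TOKEN = 2.5e-06
COST_PER_COMPLETION_TOKEN = 1.0e-05

token_usage = {
    "prompt_tokens": 1200,
    "completion_tokens": 150,
    "prompt_tokens_details": {"cached_tokens": 1000},
}

num_cached_toks = token_usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
num_prompt_toks = token_usage["prompt_tokens"] - num_cached_toks  # 200 uncached tokens
num_label_toks = token_usage["completion_tokens"]

cost = (
    num_prompt_toks * COST_PER_PROMPT_TOKEN            # 200  * 2.5e-06  = 0.000500
    + num_cached_toks * (COST_PER_PROMPT_TOKEN / 2.0)  # 1000 * 1.25e-06 = 0.001250
    + num_label_toks * COST_PER_COMPLETION_TOKEN       # 150  * 1.0e-05  = 0.001500
)
print(f"${cost:.6f}")  # $0.003250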
4 changes: 3 additions & 1 deletion src/autolabel/models/openai_vision.py
@@ -136,7 +136,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[time() - start_time] * len(generations),
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         encoding = self.tiktoken.encoding_for_model(self.model_name)
         num_prompt_toks = len(encoding.encode(prompt))
         if label:
4 changes: 3 additions & 1 deletion src/autolabel/models/refuelV2.py
@@ -279,7 +279,9 @@ def _prepare_output_schema(self, schema: Dict) -> Dict:
                 curr_schema[key] = self._prepare_output_schema(curr_schema[key])
         return curr_schema
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
6 changes: 4 additions & 2 deletions src/autolabel/models/vllm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from autolabel.models import BaseModel
 from autolabel.configs import AutolabelConfig
@@ -115,7 +115,9 @@ def _process_confidence_request(self, logprobs):
             resp.append({curr_logprob_obj.decoded_token: curr_logprob_obj.logprob})
         return resp
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
3 changes: 3 additions & 0 deletions src/autolabel/schema.py
@@ -199,6 +199,9 @@ class RefuelLLMResult(BaseModel):
 
     generations: List[List[Union[Generation, ChatGeneration]]]
 
+    """Arbitrary LLM provider-specific output."""
+    llm_output: Optional[dict] = None
+
     """Errors encountered while running the labeling job"""
     errors: List[Optional[LabelingError]]
 
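The new llm_output field is the channel through which provider-reported token usage reaches get_cost: the OpenAI model attaches result.llm_output to the RefuelLLMResult it returns, and BaseModel.label forwards it when computing per-prompt costs (see base.py above). A hypothetical construction is sketched below; the Generation import path is an assumption and the usage numbers are illustrative:

# Hypothetical example; the Generation import path is an assumption and the
# usage numbers are illustrative.
from langchain.schema import Generation

from autolabel.schema import RefuelLLMResult

result = RefuelLLMResult(
    generations=[[Generation(text="positive")]],
    llm_output={
        "token_usage": {
            "prompt_tokens": 1200,
            "completion_tokens": 150,
            "prompt_tokens_details": {"cached_tokens": 1000},
        }
    },
    errors=[None],
    latencies=[0.0],
)

# BaseModel.label forwards this to the model's get_cost, as in base.py above:
# cost = model.get_cost(prompt, label=result.generations[0][0].text,
#                       llm_output=result.llm_output)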
