Commit 3cc7c18 (1 parent: d660f0d)

Co-authored-by: Logan Grado <[email protected]>

Showing 14 changed files with 3,350 additions and 0 deletions.
7 additions & 0 deletions (file path not shown in this view):

```
.DS_Store
.idea
*.log
tmp/

*.csv
*.ipynb
```
blogs/artificial-intelligence/pytorch-tunableop/README.md: 471 additions & 0 deletions (large diff not rendered)
blogs/artificial-intelligence/pytorch-tunableop/README.mdt: 383 additions & 0 deletions (large diff not rendered; this is the Jinja template from which README.md is generated by the render script below)
blogs/artificial-intelligence/pytorch-tunableop/docker/docker-compose.yaml: 13 additions & 0 deletions

```yaml
services:
  blog:
    image: blog/tunableop
    build:
      context: ..
      dockerfile: ./docker/dockerfile
    # Expose the AMD GPU device nodes to the container
    devices:
      - /dev/kfd
      - /dev/dri
    volumes:
      - ../:/root/workdir      # Mount the project at the image's WORKDIR
      - ~/.cache:/root/.cache  # Share the host cache directory
    command: /bin/bash
```
blogs/artificial-intelligence/pytorch-tunableop/docker/dockerfile: 17 additions & 0 deletions

```dockerfile
ARG PY=3.10
FROM rocm/dev-ubuntu-22.04:6.0.2-complete
# Re-declare the build arg after FROM so it is visible in this stage
ARG PY

# Install Python ${PY} from the deadsnakes PPA
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y python${PY} python${PY}-venv python${PY}-dev git

# Install the PDM package manager
RUN curl -sSL https://pdm-project.org/install-pdm.py | python${PY} -

# Install dependencies using PDM, caching downloads between builds
COPY pyproject.toml pdm.lock ./
RUN --mount=type=cache,target=/root/.cache /root/.local/bin/pdm sync
# Put the PDM-created virtualenv first on PATH
ENV PATH="/.venv/bin:${PATH}"

WORKDIR /root/workdir
```
blogs/artificial-intelligence/pytorch-tunableop/images/tunableop_blog_34_0.png: binary file added (BIN, +10.5 KB)
blogs/artificial-intelligence/pytorch-tunableop/pdm.lock: 2,259 additions & 0 deletions (large diff not rendered)
blogs/artificial-intelligence/pytorch-tunableop/pyproject.toml: 35 additions & 0 deletions

```toml
[project]
name = "pytorch-tunableop"
version = "0.1.0"
description = ""
authors = [
    {name = "Logan Grado", email = "[email protected]"},
]
dependencies = [
    # "torch==2.4.0.dev20240423",
    "transformers>=4.39.3",
    "pandas>=2.2.2",
    "matplotlib>=3.8.4",
    "click>=8.1.7",
    "torch==2.3.0",
]
requires-python = ">=3.10,<3.11"
readme = "README.md"
license = {text = "MIT"}

# Pull ROCm builds of torch from the PyTorch wheel index
[[tool.pdm.source]]
name = "torch"
url = "https://download.pytorch.org/whl/rocm6.0/"

[tool.pdm]
distribution = false

[tool.pdm.dev-dependencies]
dev = [
    "ipython>=8.21.0",
    "ipdb>=0.13.13",
    "jupyter>=1.0.0",
    "jupyterlab>=4.1.2",
    "jupytext>=1.16.1",
    "nbconvert>=7.16.2",
]
```
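As a quick sanity check of this setup (not part of the commit), one can verify that the ROCm build of torch was actually installed from the wheel index above; a minimal sketch:

```python
import torch

# On a ROCm build of PyTorch, torch.version.hip is set and the HIP backend
# is exposed through the usual CUDA device API.
print(torch.__version__)           # Expect a +rocm-suffixed 2.3.0 build
print(torch.version.hip)           # HIP/ROCm version string; None on CUDA builds
print(torch.cuda.is_available())   # True if /dev/kfd and /dev/dri are visible
```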
66 additions & 0 deletions (file path not shown in this view):

```python
#!/usr/bin/env python3

import jinja2
import ast
import subprocess


def include_source(name, func=None):
    with open(name, "r") as f:
        source = f.read()

    if func:
        tree = ast.parse(source)

        # Search for FunctionDef nodes with the specified function name
        found = False
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) and node.name == func:
                # Return the source code for the found function
                # Retrieve any decorator lines first
                decorator_lines = ""
                for decorator in node.decorator_list:
                    decorator_lines += f"@{ast.get_source_segment(source, decorator)}\n"

                # Retrieve function body next
                source = decorator_lines + ast.get_source_segment(source, node)
                found = True
                break

        if not found:
            raise ValueError(f"Unable to find function: {func}")

    source = source.strip()
    return source


def main():
    src = "README.mdt"
    dest = "README.md"

    env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
    env.globals["include_source"] = include_source

    template = env.get_template(src)

    output = template.render()

    with open(dest, "w") as f:
        f.write(output)

    subprocess.run(
        [
            "markdownlint-cli2",
            "--fix",
            "--config",
            "../../../.markdownlint.yaml",
            "README.md",
        ]
    )

    print(f"Rendered {src} -> {dest}")


if __name__ == "__main__":
    main()
```
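README.mdt itself is not shown in this diff, but given the `include_source` helper above, a template call would plausibly look like the following. A hypothetical sketch: the module name `render` assumes the script above is saved as `render.py`, and the referenced function comes from src/matmul.py in this commit:

```python
import jinja2

from render import include_source  # Hypothetical: assumes the script above is render.py

env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
env.globals["include_source"] = include_source

# A line like this in README.mdt expands to the source of time_matmul()
template = env.from_string('{{ include_source("src/matmul.py", func="time_matmul") }}')
print(template.render())  # Prints the extracted function source
```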
blogs/artificial-intelligence/pytorch-tunableop/src/llm.py: 67 additions & 0 deletions

```python
import os
import torch
import transformers
import click


# Use Click to parse command-line arguments
@click.command
@click.option("--tune", is_flag=True)
def main(tune):
    # Set some variables
    seq_len = 256    # Max sequence length to generate
    n_batches = 8    # Number of batches to time
    n_warmup = 2     # Number of warmup batches
    prompt = ["Hello Earthlings!"]  # Input prompt

    # We can enable tuning by setting the environment variables within the code - as long
    # as we do so before using torch. This is often less cumbersome than passing the
    # environment variables each time
    if tune:
        print("Tuning enabled")
        os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"  # Enable tuning
        os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "src/llm_result.csv"  # Specify output file

    # Retrieve the model and tokenizer
    model = "google/gemma-2b"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    model = transformers.AutoModelForCausalLM.from_pretrained(model).to("cuda")

    # Set the model to use a static KV cache - see
    # https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=generation_config#static-kv-cache-and-torchcompile
    model.generation_config.cache_implementation = "static"

    # Tokenize our input.
    # Use padding with `pad_to_multiple_of` to minimize the number of GEMMs to tune.
    # Larger values => fewer GEMMs to tune, but more potential overhead for shorter prompts
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, pad_to_multiple_of=8).to("cuda")

    # Determine how many tokens to generate. We subtract the number of tokens in the
    # prompt to keep the overall sequence length constant
    n_tokens = seq_len - inputs["input_ids"].shape[1]  # Number of tokens to generate

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)
    for i in range(n_batches + n_warmup):
        # Don't start timing until we've finished our warmup iters
        if i == n_warmup:
            torch.cuda.synchronize()
            t0.record()

        # Generate!
        model.generate(
            **inputs,
            max_new_tokens=n_tokens,  # Force the model to generate exactly n_tokens before stopping
            min_new_tokens=n_tokens,
            use_cache=True,  # Ensure we use the kv-cache
        )

    # Complete timing, synchronize, and compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    tokens_per_second = n_batches * n_tokens / dt
    print(f"Tokens/second: {tokens_per_second:0.4f} ({n_tokens*n_batches} tokens, {dt:0.2f} seconds)")


if __name__ == "__main__":
    main()
```
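Once a tuned run has written `src/llm_result.csv`, the selected kernels can be inspected with pandas (already a dependency in pyproject.toml). A minimal sketch, assuming the usual TunableOp CSV layout of a few validator header rows followed by one row per tuned GEMM (operator signature, parameter string, chosen solution, time); the exact columns may vary by PyTorch version, and multi-GPU runs may add a device suffix to the filename:

```python
import pandas as pd

# Column names here are our own labels, not defined by the file itself
df = pd.read_csv(
    "src/llm_result.csv",
    header=None,
    names=["operator", "params", "solution", "time_ms"],
)
df = df[~df["operator"].str.startswith("Validator")]  # Drop validator header rows
print(df.sort_values("time_ms").head(10))             # Fastest tuned GEMMs first
```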
blogs/artificial-intelligence/pytorch-tunableop/src/matmul.py: 29 additions & 0 deletions

```python
import torch


def time_matmul(M, N, K):
    n_iter = 10000  # Number of iterations to time
    n_warmup = 10   # Number of warmup iterations

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)

    # Construct input matrices
    A = torch.rand(M, K, device="cuda")
    B = torch.rand(K, N, device="cuda")

    # Benchmark the GEMM
    for i in range(n_iter + n_warmup):
        if i == n_warmup:
            t0.record()  # Don't start recording until warmup is finished
        C = A @ B

    # Compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    print(f"{n_iter/dt:0.2f} iter/s ({dt:0.4g}s)")


time_matmul(512, 1024, 2048)
```
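Mirroring the `--tune` branch in llm.py above, the same GEMM can be run with TunableOp enabled by setting the environment variables before torch is first used. A minimal sketch (not part of the commit; the results filename is our own choice):

```python
# The environment variables must be set before torch initializes its backends
import os

os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"              # Enable tuning and tuned lookups
os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "matmul_result.csv"  # Our own choice of output file

import torch  # Imported after the env vars on purpose

A = torch.rand(512, 2048, device="cuda")
B = torch.rand(2048, 1024, device="cuda")
C = A @ B  # First encounter of this GEMM shape triggers tuning; results land in the CSV
```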