
Commit

Adding PyTorch TunableOp blog
---------

Co-authored-by: Logan Grado <[email protected]>
Danny213123 and loggrado committed Jul 3, 2024
1 parent d660f0d commit 3cc7c18
Showing 14 changed files with 3,350 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,5 +19,7 @@ docs/about/release-notes.md
docs/about/CHANGELOG.md

.ipynb_checkpoints
__pycache__
.pdm-python
.vs/*
blogs/artificial-intelligence/.vs/*
1 change: 1 addition & 0 deletions .markdownlint.yaml
@@ -16,3 +16,4 @@ ignores:
- docs/CHANGELOG.md
- "{,docs/}{RELEASE,release}.md"
- tools/autotag/templates/**/*.md
- "**/source.md"
7 changes: 7 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/.gitignore
@@ -0,0 +1,7 @@
.DS_Store
.idea
*.log
tmp/

*.csv
*.ipynb
471 changes: 471 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/README.md

Large diffs are not rendered by default.

383 changes: 383 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/README.mdt

Large diffs are not rendered by default.

@@ -0,0 +1,13 @@
services:
  blog:
    image: blog/tunableop
    build:
      context: ..
      dockerfile: ./docker/dockerfile
    devices:
      - /dev/kfd
      - /dev/dri
    volumes:
      - ../:/root/workdir
      - ~/.cache:/root/.cache
    command: /bin/bash
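The compose file's own name and location are not shown in this diff, but the build context of ".." together with "dockerfile: ./docker/dockerfile" suggests it sits in the blog's docker/ directory. Assuming it is saved as docker/docker-compose.yaml (an assumption, not confirmed by the commit), the development container could be started with something like:

    cd blogs/artificial-intelligence/pytorch-tunableop/docker
    docker compose build
    docker compose run --rm blog

The "../:/root/workdir" volume mount resolves relative to the compose file, so the blog's project root appears at /root/workdir inside the container, matching the WORKDIR set in the Dockerfile below.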
17 changes: 17 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/docker/dockerfile
@@ -0,0 +1,17 @@
ARG PY=3.10
FROM rocm/dev-ubuntu-22.04:6.0.2-complete
ARG PY

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y python${PY} python${PY}-venv python${PY}-dev git

RUN curl -sSL https://pdm-project.org/install-pdm.py | python${PY} -

# Install using PDM
COPY pyproject.toml pdm.lock ./
RUN --mount=type=cache,target=/root/.cache /root/.local/bin/pdm sync
ENV PATH="/.venv/bin:${PATH}"

WORKDIR /root/workdir
(Two additional files in this commit could not be displayed in the diff view.)
2,259 changes: 2,259 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/pdm.lock

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/pyproject.toml
@@ -0,0 +1,35 @@
[project]
name = "pytorch-tunableop"
version = "0.1.0"
description = ""
authors = [
    {name = "Logan Grado", email = "[email protected]"},
]
dependencies = [
    # "torch==2.4.0.dev20240423",
    "transformers>=4.39.3",
    "pandas>=2.2.2",
    "matplotlib>=3.8.4",
    "click>=8.1.7",
    "torch==2.3.0",
]
requires-python = ">=3.10,<3.11"
readme = "README.md"
license = {text = "MIT"}

[[tool.pdm.source]]
name = "torch"
url = "https://download.pytorch.org/whl/rocm6.0/"

[tool.pdm]
distribution = false

[tool.pdm.dev-dependencies]
dev = [
    "ipython>=8.21.0",
    "ipdb>=0.13.13",
    "jupyter>=1.0.0",
    "jupyterlab>=4.1.2",
    "jupytext>=1.16.1",
    "nbconvert>=7.16.2",
]
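Because torch is pinned to 2.3.0 and resolved against the ROCm 6.0 wheel index declared in [[tool.pdm.source]], a quick sanity check after pdm sync could look like the following sketch (illustrative only, not part of the commit):

    # Illustrative only: confirm the ROCm build of PyTorch is installed
    import torch

    print(torch.__version__)          # ROCm wheels typically report a "+rocm" local version suffix
    print(torch.version.hip)          # set for ROCm builds, None for CUDA builds
    print(torch.cuda.is_available())  # True when an AMD GPU is visible through ROCm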
66 changes: 66 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/render.py
@@ -0,0 +1,66 @@
#!/usr/bin/env python3

import jinja2
import ast
import inspect
import subprocess


def include_source(name, func=None):
    with open(name, "r") as f:
        source = f.read()

    if func:
        tree = ast.parse(source)

        # Search for FunctionDef nodes with the specified function name
        found = False
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) and node.name == func:
                # Return the source code for the found function
                # Retrieve any decorator lines first
                decorator_lines = ""
                for decorator in node.decorator_list:
                    decorator_lines += f"@{ast.get_source_segment(source, decorator)}\n"

                # Retrieve function body next
                source = decorator_lines + ast.get_source_segment(source, node)
                found = True
                break

        if not found:
            raise ValueError(f"Unable to find function: {func}")

    source = source.strip()
    return source


def main():
    src = "README.mdt"
    dest = "README.md"

    env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
    env.globals["include_source"] = include_source

    template = env.get_template(src)

    output = template.render()

    with open(dest, "w") as f:
        f.write(output)

    subprocess.run(
        [
            "markdownlint-cli2",
            "--fix",
            "--config",
            "../../../.markdownlint.yaml",
            "README.md",
        ]
    )

    print(f"Rendered {src} -> {dest}")


if __name__ == "__main__":
    main()
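render.py registers include_source as a Jinja2 global, so README.mdt can pull code straight out of the source files each time the README is rendered. A hypothetical template expression (README.mdt itself is not rendered in this diff) would look like:

    {{ include_source("src/matmul.py", func="time_matmul") }}

which expands to the decorator lines and body of time_matmul, keeping the published README in sync with the code it quotes.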
67 changes: 67 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/src/llm.py
@@ -0,0 +1,67 @@
import os
import torch
import transformers
import click


# Use Click to parse command-line arguments
@click.command
@click.option("--tune", is_flag=True)
def main(tune):
    # Set some variables
    seq_len = 256  # Max sequence length to generate
    n_batches = 8  # Number of batches to time
    n_warmup = 2  # Number of warmup batches
    prompt = ["Hello Earthlings!"]  # Input prompt

    # We can enable tuning by setting the environment variables within the code - as long as we do so before
    # using torch. This is often less cumbersome than passing the environment variables each time
    if tune:
        print("Tuning enabled")
        os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"  # Enable tuning
        os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "src/llm_result.csv"  # Specify output file

    # Retrieve the model and tokenizer
    model = "google/gemma-2b"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    model = transformers.AutoModelForCausalLM.from_pretrained(model).to("cuda")

    # Set the model to use a static KV cache - see https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=generation_config#static-kv-cache-and-torchcompile
    model.generation_config.cache_implementation = "static"

    # Tokenize our input.
    # Use padding with `pad_to_multiple_of` to minimize the number of GEMMs to tune
    # Larger values => Less GEMMs to tune, but more potential overhead for shorter prompts
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, pad_to_multiple_of=8).to("cuda")

    # Determine how many tokens to generate. Here, we need to subtract the number of tokens in the prompt to keep the same
    # overall sequence length
    n_tokens = seq_len - inputs["input_ids"].shape[1]  # number of tokens to generate

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)
    for i in range(n_batches + n_warmup):
        # Don't start timing until we've finished our warmup iters
        if i == n_warmup:
            torch.cuda.synchronize()
            t0.record()

        # Generate!
        model.generate(
            **inputs,
            max_new_tokens=n_tokens,  # Force the model to generate exactly n_tokens before stopping
            min_new_tokens=n_tokens,
            use_cache=True,  # Ensure we use the kv-cache
        )

    # Complete timing, synchronize, and compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    tokens_per_second = n_batches * n_tokens / dt
    print(f"  Tokens/second: {tokens_per_second:0.4f} ({n_tokens*n_batches} tokens, {dt:0.2f} seconds)")


if __name__ == "__main__":
    main()
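Given the Click flag above, the benchmark would presumably be run twice, once for a baseline and once with tuning enabled (illustrative invocations, not part of the commit):

    python src/llm.py          # baseline throughput
    python src/llm.py --tune   # enables TunableOp; tuned GEMM selections go to src/llm_result.csv

The first tuned run is slower while TunableOp benchmarks candidate GEMM implementations; later runs that find src/llm_result.csv can reuse the stored selections.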
29 changes: 29 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/src/matmul.py
@@ -0,0 +1,29 @@
import torch


def time_matmul(M, N, K):
    n_iter = 10000  # Number of iterations to time
    n_warmup = 10  # Number of warmup iterations

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)

    # Construct input matrices
    A = torch.rand(M, K, device="cuda")
    B = torch.rand(K, N, device="cuda")

    # Benchmark the GEMM
    for i in range(n_iter + n_warmup):
        if i == n_warmup:
            t0.record()  # Don't start recording until warmup is finished
        C = A @ B

    # Compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    print(f"{n_iter/dt:0.2f} iter/s ({dt:0.4g}s)")


time_matmul(512, 1024, 2048)
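matmul.py itself never enables tuning. A minimal sketch of tuning this GEMM with the same environment variables that src/llm.py sets is shown below (illustrative only; the output file name is an assumption):

    # Illustrative sketch, not part of the commit: enable TunableOp before torch runs its first GEMM
    import os

    os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                   # turn tuning on, as in src/llm.py
    os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "matmul_result.csv"  # assumed output file name

    import torch  # import after the environment is configured

    A = torch.rand(512, 2048, device="cuda")
    B = torch.rand(2048, 1024, device="cuda")
    C = A @ B                 # the first matmul of this shape triggers tuning
    torch.cuda.synchronize()  # wait for the GEMM (and any tuning) to finish

Equivalently, the existing script can be launched with the variables set in the shell, for example: PYTORCH_TUNABLEOP_ENABLED=1 python src/matmul.py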
