
Commit

Adding PyTorch TunableOp blog
---------

Co-authored-by: Logan Grado <[email protected]>
Danny213123 and loggrado committed Jul 3, 2024
1 parent d660f0d commit 3cc7c18
Showing 14 changed files with 3,350 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,5 +19,7 @@ docs/about/release-notes.md
docs/about/CHANGELOG.md

.ipynb_checkpoints
__pycache__
.pdm-python
.vs/*
blogs/artificial-intelligence/.vs/*
1 change: 1 addition & 0 deletions .markdownlint.yaml
@@ -16,3 +16,4 @@ ignores:
- docs/CHANGELOG.md
- "{,docs/}{RELEASE,release}.md"
- tools/autotag/templates/**/*.md
- "**/source.md"
7 changes: 7 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/.gitignore
@@ -0,0 +1,7 @@
.DS_Store
.idea
*.log
tmp/

*.csv
*.ipynb
471 changes: 471 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/README.md

Large diffs are not rendered by default.

383 changes: 383 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/README.mdt

Large diffs are not rendered by default.

@@ -0,0 +1,13 @@
services:
  blog:
    image: blog/tunableop
    build:
      context: ..
      dockerfile: ./docker/dockerfile
    devices:
      - /dev/kfd
      - /dev/dri
    volumes:
      - ../:/root/workdir
      - ~/.cache:/root/.cache
    command: /bin/bash
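The compose file's own name and location are not shown in this diff, but the build context of ".." together with "dockerfile: ./docker/dockerfile" suggests it sits in the blog's docker/ directory. Assuming it is saved as docker/docker-compose.yaml (an assumption, not confirmed by the commit), the development container could be started with something like:

    cd blogs/artificial-intelligence/pytorch-tunableop/docker
    docker compose build
    docker compose run --rm blog

The "../:/root/workdir" volume mount resolves relative to the compose file, so the blog's project root appears at /root/workdir inside the container, matching the WORKDIR set in the Dockerfile below.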
17 changes: 17 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/docker/dockerfile
@@ -0,0 +1,17 @@
ARG PY=3.10
FROM rocm/dev-ubuntu-22.04:6.0.2-complete
ARG PY

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y python${PY} python${PY}-venv python${PY}-dev git

RUN curl -sSL https://pdm-project.org/install-pdm.py | python${PY} -

# Install using PDM
COPY pyproject.toml pdm.lock ./
RUN --mount=type=cache,target=/root/.cache /root/.local/bin/pdm sync
ENV PATH="/.venv/bin:${PATH}"

WORKDIR /root/workdir
(Two additional files in this commit could not be displayed in the diff view.)
2,259 changes: 2,259 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/pdm.lock

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/pyproject.toml
@@ -0,0 +1,35 @@
[project]
name = "pytorch-tunableop"
version = "0.1.0"
description = ""
authors = [
    {name = "Logan Grado", email = "[email protected]"},
]
dependencies = [
    # "torch==2.4.0.dev20240423",
    "transformers>=4.39.3",
    "pandas>=2.2.2",
    "matplotlib>=3.8.4",
    "click>=8.1.7",
    "torch==2.3.0",
]
requires-python = ">=3.10,<3.11"
readme = "README.md"
license = {text = "MIT"}

[[tool.pdm.source]]
name = "torch"
url = "https://download.pytorch.org/whl/rocm6.0/"

[tool.pdm]
distribution = false

[tool.pdm.dev-dependencies]
dev = [
    "ipython>=8.21.0",
    "ipdb>=0.13.13",
    "jupyter>=1.0.0",
    "jupyterlab>=4.1.2",
    "jupytext>=1.16.1",
    "nbconvert>=7.16.2",
]
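Because torch is pinned to 2.3.0 and resolved against the ROCm 6.0 wheel index declared in [[tool.pdm.source]], a quick sanity check after pdm sync could look like the following sketch (illustrative only, not part of the commit):

    # Illustrative only: confirm the ROCm build of PyTorch is installed
    import torch

    print(torch.__version__)          # ROCm wheels typically report a "+rocm" local version suffix
    print(torch.version.hip)          # set for ROCm builds, None for CUDA builds
    print(torch.cuda.is_available())  # True when an AMD GPU is visible through ROCm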
66 changes: 66 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/render.py
@@ -0,0 +1,66 @@
#!/usr/bin/env python3

import jinja2
import ast
import inspect
import subprocess


def include_source(name, func=None):
    with open(name, "r") as f:
        source = f.read()

    if func:
        tree = ast.parse(source)

        # Search for FunctionDef nodes with the specified function name
        found = False
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) and node.name == func:
                # Return the source code for the found function
                # Retrieve any decorator lines first
                decorator_lines = ""
                for decorator in node.decorator_list:
                    decorator_lines += f"@{ast.get_source_segment(source, decorator)}\n"

                # Retrieve function body next
                source = decorator_lines + ast.get_source_segment(source, node)
                found = True
                break

        if not found:
            raise ValueError(f"Unable to find function: {func}")

    source = source.strip()
    return source


def main():
    src = "README.mdt"
    dest = "README.md"

    env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
    env.globals["include_source"] = include_source

    template = env.get_template(src)

    output = template.render()

    with open(dest, "w") as f:
        f.write(output)

    subprocess.run(
        [
            "markdownlint-cli2",
            "--fix",
            "--config",
            "../../../.markdownlint.yaml",
            "README.md",
        ]
    )

    print(f"Rendered {src} -> {dest}")


if __name__ == "__main__":
    main()
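render.py registers include_source as a Jinja2 global, so README.mdt can pull code straight out of the source files each time the README is rendered. A hypothetical template expression (README.mdt itself is not rendered in this diff) would look like:

    {{ include_source("src/matmul.py", func="time_matmul") }}

which expands to the decorator lines and body of time_matmul, keeping the published README in sync with the code it quotes.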
67 changes: 67 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/src/llm.py
@@ -0,0 +1,67 @@
import os
import torch
import transformers
import click


# Use Click to parse command-line arguments
@click.command
@click.option("--tune", is_flag=True)
def main(tune):
    # Set some variables
    seq_len = 256  # Max sequence length to generate
    n_batches = 8  # Number of batches to time
    n_warmup = 2  # Number of warmup batches
    prompt = ["Hello Earthlings!"]  # Input prompt

    # We can enable tuning by setting the environment variables within the code - as long as we do so before
    # using torch. This is often less cumbersome than passing the environment variables each time
    if tune:
        print("Tuning enabled")
        os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"  # Enable tuning
        os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "src/llm_result.csv"  # Specify output file

    # Retrieve the model and tokenizer
    model = "google/gemma-2b"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    model = transformers.AutoModelForCausalLM.from_pretrained(model).to("cuda")

    # Set the model to use a static KV cache - see https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=generation_config#static-kv-cache-and-torchcompile
    model.generation_config.cache_implementation = "static"

    # Tokenize our input.
    # Use padding with `pad_to_multiple_of` to minimize the number of GEMMs to tune
    # Larger values => Less GEMMs to tune, but more potential overhead for shorter prompts
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, pad_to_multiple_of=8).to("cuda")

    # Determine how many tokens to generate. Here, we need to subtract the number of tokens in the prompt to keep the same
    # overall sequence length
    n_tokens = seq_len - inputs["input_ids"].shape[1]  # number of tokens to generate

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)
    for i in range(n_batches + n_warmup):
        # Don't start timing until we've finished our warmup iters
        if i == n_warmup:
            torch.cuda.synchronize()
            t0.record()

        # Generate!
        model.generate(
            **inputs,
            max_new_tokens=n_tokens,  # Force the model to generate exactly n_tokens before stopping
            min_new_tokens=n_tokens,
            use_cache=True,  # Ensure we use the kv-cache
        )

    # Complete timing, synchronize, and compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    tokens_per_second = n_batches * n_tokens / dt
    print(f"  Tokens/second: {tokens_per_second:0.4f} ({n_tokens*n_batches} tokens, {dt:0.2f} seconds)")


if __name__ == "__main__":
    main()
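Given the Click flag above, the benchmark would presumably be run twice, once for a baseline and once with tuning enabled (illustrative invocations, not part of the commit):

    python src/llm.py          # baseline throughput
    python src/llm.py --tune   # enables TunableOp; tuned GEMM selections go to src/llm_result.csv

The first tuned run is slower while TunableOp benchmarks candidate GEMM implementations; later runs that find src/llm_result.csv can reuse the stored selections.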
29 changes: 29 additions & 0 deletions blogs/artificial-intelligence/pytorch-tunableop/src/matmul.py
@@ -0,0 +1,29 @@
import torch


def time_matmul(M, N, K):
    n_iter = 10000  # Number of iterations to time
    n_warmup = 10  # Number of warmup iterations

    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)

    # Construct input matrices
    A = torch.rand(M, K, device="cuda")
    B = torch.rand(K, N, device="cuda")

    # Benchmark the GEMM
    for i in range(n_iter + n_warmup):
        if i == n_warmup:
            t0.record()  # Don't start recording until warmup is finished
        C = A @ B

    # Compute elapsed time
    t1.record()
    torch.cuda.synchronize()
    dt = t0.elapsed_time(t1) / 1000

    print(f"{n_iter/dt:0.2f} iter/s ({dt:0.4g}s)")


time_matmul(512, 1024, 2048)
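matmul.py itself never enables tuning. A minimal sketch of tuning this GEMM with the same environment variables that src/llm.py sets is shown below (illustrative only; the output file name is an assumption):

    # Illustrative sketch, not part of the commit: enable TunableOp before torch runs its first GEMM
    import os

    os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                   # turn tuning on, as in src/llm.py
    os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "matmul_result.csv"  # assumed output file name

    import torch  # import after the environment is configured

    A = torch.rand(512, 2048, device="cuda")
    B = torch.rand(2048, 1024, device="cuda")
    C = A @ B                 # the first matmul of this shape triggers tuning
    torch.cuda.synchronize()  # wait for the GEMM (and any tuning) to finish

Equivalently, the existing script can be launched with the variables set in the shell, for example: PYTORCH_TUNABLEOP_ENABLED=1 python src/matmul.py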
