From 0d66c8e854c95b84344e1d77ce5d111014a95e73 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Wed, 18 Dec 2024 21:14:23 +0100
Subject: [PATCH] Revert "try changes from #3036"

This reverts commit 2a4b81888266fb1829b1d4b739001d3c2d4270cd.
---
 benchmarks/triton_kernels_benchmark/__init__.py          | 2 +-
 benchmarks/triton_kernels_benchmark/benchmark_testing.py | 8 --------
 .../flash_attention_fwd_benchmark.py                     | 2 +-
 benchmarks/triton_kernels_benchmark/fused_softmax.py     | 1 -
 benchmarks/triton_kernels_benchmark/gemm_benchmark.py    | 2 --
 .../gemm_postop_addmatrix_benchmark.py                   | 2 --
 .../gemm_postop_gelu_benchmark.py                        | 2 --
 .../triton_kernels_benchmark/gemm_preop_exp_benchmark.py | 2 --
 .../triton_kernels_benchmark/gemm_splitk_benchmark.py    | 1 -
 .../triton_kernels_benchmark/gemm_streamk_benchmark.py   | 2 --
 python/triton/runtime/autotuner.py                       | 2 +-
 11 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/benchmarks/triton_kernels_benchmark/__init__.py b/benchmarks/triton_kernels_benchmark/__init__.py
index 43b1f9722b..02857fdd99 100644
--- a/benchmarks/triton_kernels_benchmark/__init__.py
+++ b/benchmarks/triton_kernels_benchmark/__init__.py
@@ -1,4 +1,4 @@
-from .benchmark_testing import do_bench, make_do_bench_for_autotune, assert_close, perf_report, Benchmark, USE_IPEX_OPTION, BENCHMARKING_METHOD  # type: ignore  # noqa: F401
+from .benchmark_testing import do_bench, assert_close, perf_report, Benchmark, USE_IPEX_OPTION, BENCHMARKING_METHOD  # type: ignore  # noqa: F401
 
 if USE_IPEX_OPTION or BENCHMARKING_METHOD == "UPSTREAM_PYTORCH_PROFILER":
     from triton.runtime import driver
diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py
index 30f2467e97..1e088291fa 100644
--- a/benchmarks/triton_kernels_benchmark/benchmark_testing.py
+++ b/benchmarks/triton_kernels_benchmark/benchmark_testing.py
@@ -237,14 +237,6 @@ def extract_kernels(funcs):
     raise NotImplementedError(f"BENCHMARKING_METHOD: {BENCHMARKING_METHOD} isn't implemented")
 
 
-def make_do_bench_for_autotune():
-
-    def autotuner_do_bench(*args, **kwargs):
-        return do_bench(*args, n_warmup=10, n_repeat=10, **kwargs)
-
-    return autotuner_do_bench
-
-
 def assert_close(x, y, atol=None, rtol=None, err_msg=""):
     import numpy as np
     import torch
diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
index ed63118ab0..132898c023 100644
--- a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
@@ -164,7 +164,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out,  #
     for w in [8, 16, 32] \
 ]
 
-tuner = triton.autotune(configs, key=['N_CTX', 'BLOCK_DMODEL'], do_bench=benchmark_suit.make_do_bench_for_autotune())
+tuner = triton.autotune(configs, key=['N_CTX', 'BLOCK_DMODEL'])
 tune_attn_fwd = tuner(_attn_fwd)
 
 
diff --git a/benchmarks/triton_kernels_benchmark/fused_softmax.py b/benchmarks/triton_kernels_benchmark/fused_softmax.py
index 56cd91befe..6782e92d6b 100644
--- a/benchmarks/triton_kernels_benchmark/fused_softmax.py
+++ b/benchmarks/triton_kernels_benchmark/fused_softmax.py
@@ -50,7 +50,6 @@ def naive_softmax(x):
         triton.Config({"threads_per_warp": 16}, num_warps=4),
     ],
     key=["BLOCK_SIZE_X", "BLOCK_SIZE_Y"],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def softmax_kernel(output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, BLOCK_SIZE_X: tl.constexpr,
diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
index a9b064714d..4ad3d8d5e5 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
@@ -43,7 +43,6 @@
             num_stages=s, num_warps=32) for s in [2, 3]
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers(
@@ -117,7 +116,6 @@ def matmul_kernel_with_block_pointers(
             num_stages=s, num_warps=4) for s in [2]
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers_batched(
diff --git a/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py
index 7d40709845..cefbd5abc9 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py
@@ -35,7 +35,6 @@
             num_stages=2, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers(
@@ -110,7 +109,6 @@ def matmul_kernel_with_block_pointers(
             num_stages=2, num_warps=4),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers_batched(
diff --git a/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py
index 7ee5038b85..68cec3931e 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py
@@ -54,7 +54,6 @@ def gelu(x):
             num_stages=2, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers(
@@ -123,7 +122,6 @@ def matmul_kernel_with_block_pointers(
             num_stages=2, num_warps=4),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers_batched(
diff --git a/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py
index 6d821b4f30..dd5b57c84f 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py
@@ -36,7 +36,6 @@
             num_stages=2, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers(
@@ -108,7 +107,6 @@ def matmul_kernel_with_block_pointers(
             num_stages=2, num_warps=4),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def matmul_kernel_with_block_pointers_batched(
diff --git a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py
index 8b354c8cd2..c4114c4466 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py
@@ -15,7 +15,6 @@
             num_stages=4, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def _kernel(A, B, C,  #
diff --git a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py
index fa209319a1..f0743cfe64 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py
@@ -107,7 +107,6 @@ def mac_loop(
             num_stages=2, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def first_wave(
@@ -144,7 +143,6 @@ def first_wave(
             num_stages=2, num_warps=32),
     ],
     key=['M', 'N', 'K'],
-    do_bench=benchmark_suit.make_do_bench_for_autotune(),
 )
 @triton.jit
 def full_tiles(
diff --git a/python/triton/runtime/autotuner.py b/python/triton/runtime/autotuner.py
index 7e93086214..573d9d4191 100644
--- a/python/triton/runtime/autotuner.py
+++ b/python/triton/runtime/autotuner.py
@@ -357,7 +357,7 @@ def kernel(x_ptr, x_size, **META):
     def decorator(fn):
         return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, pre_hook=pre_hook,
                          post_hook=post_hook, prune_configs_by=prune_configs_by, warmup=warmup, rep=rep,
-                         use_cuda_graph=use_cuda_graph, do_bench=do_bench)
+                         use_cuda_graph=use_cuda_graph)
 
     return decorator
 