Skip to content

Commit

Permalink
Use new resnet test infra for GS (#12072)
Browse files Browse the repository at this point in the history
* #0: use new resnet test infra for GS

* #0: adjust 2cq e2e time

* #0: change to fn wrapper for gs and wh

* #0: update README

* #0: adjust wh compile time
  • Loading branch information
yugaoTT authored Sep 3, 2024
1 parent 32777a5 commit f03bf93
Show file tree
Hide file tree
Showing 6 changed files with 365 additions and 56 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

| Model | Batch | End-to-end throughput [1] | Device throughput [2] | Target throughput |
|---------------------------------------------------------- |---------------------|------------------------------|-----------------------------|-------------------------------------|
| [ResNet-50](./models/demos/resnet) (fps) | 20 | 5,600 | 7,560 | 10,000 |
| [ResNet-50](./models/demos/ttnn_resnet) (fps) | 20 | 5,100 | 6,600 | 10,000 |
| [BERT-Large](./models/demos/bert) (sen/s) | 12 | 370 | 406 | 410 |
| [Falcon7B-decode](./models/demos/ttnn_falcon7b) (t/s) | 32 | 135 | 135 | 140 |
| [ViT](./models/demos/grayskull/vit) (fps) | 8 | 860 | 1570 | 2000 |
Expand Down
23 changes: 17 additions & 6 deletions models/demos/ttnn_resnet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,25 @@ To obtain a huggingface token visit: https://huggingface.co/docs/hub/security-to

### Single Device

#### Device Performance
+ To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./tt_metal/tools/profiler/profile_this.py -c "pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[16z-act_dtype0-weight_dtype0-math_fidelity0-device_params0]"`
+ This will generate a CSV report under `<this repo dir>/generated/profiler/reports/ops/<report name>`. The report file name is logged in the run output.
#### Grayskull Device Performance
+ To obtain device performance, run `pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_gs`
+ This will run the model 4 times and generate CSV reports under `<this repo dir>/generated/profiler/reports/ops/<report name>`. The report file name is logged in the run output.
+ It will also show a summary of the device throughput in the run output.

#### End-to-End Performance
+ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal[16-0.004-25-device_params0]`.
#### Grayskull End-to-End Performance
+ For end-to-end performance, run `pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal_gs[20-0.004-5-device_params0]`.
+ This will generate a CSV with the timings and throughputs.
+ **Expected end-to-end perf**: For batch = 20, it is about `5,100 fps` currently. This may vary from machine to machine.

#### Wormhole_B0 Device Performance
+ To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_wh`
+ This will run the model 4 times and generate CSV reports under `<this repo dir>/generated/profiler/reports/ops/<report name>`. The report file name is logged in the run output.
+ It will also show a summary of the device throughput in the run output.

#### Wormhole_B0 End-to-End Performance
+ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal_wh[16-0.004-25-device_params0]`.
+ This will generate a CSV with the timings and throughputs.
+ **Expected end-to-end perf**: For batch = 16, it is about `4100 fps` currently. This may vary machine to machine.
+ **Expected end-to-end perf**: For batch = 16, it is about `4,100 fps` currently. This may vary from machine to machine.

### T3000
#### End-to-End Performance
Expand Down
36 changes: 31 additions & 5 deletions models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,35 @@

import pytest
from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
from models.utility_functions import run_for_wormhole_b0
from models.utility_functions import run_for_wormhole_b0, run_for_grayskull


@run_for_grayskull()
@pytest.mark.models_device_performance_bare_metal
@pytest.mark.parametrize(
"batch_size, test, expected_perf",
[
[20, "20-act_dtype0-weight_dtype0-math_fidelity0-device_params0", 6600],
],
)
def test_perf_device_bare_metal_gs(batch_size, test, expected_perf):
subdir = "resnet50"
num_iterations = 4
margin = 0.03
command = f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference_gs[{test}]"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
expected_perf_cols = {inference_time_key: expected_perf}

post_processed_results = run_device_perf(command, subdir, num_iterations, cols, batch_size, True)
expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols)
prep_device_perf_report(
model_name=f"ttnn_resnet50_batch_size{batch_size}",
batch_size=batch_size,
post_processed_results=post_processed_results,
expected_results=expected_results,
comments=test,
)


@run_for_wormhole_b0()
Expand All @@ -15,13 +43,11 @@
[16, "16-act_dtype0-weight_dtype0-math_fidelity0-device_params0", 5020],
],
)
def test_perf_device_bare_metal(batch_size, test, expected_perf):
def test_perf_device_bare_metal_wh(batch_size, test, expected_perf):
subdir = "resnet50"
num_iterations = 4
margin = 0.03
command = (
f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[{test}]"
)
command = f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference_wh[{test}]"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
expected_perf_cols = {inference_time_key: expected_perf}
Expand Down
129 changes: 124 additions & 5 deletions models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
)

from models.utility_functions import (
is_grayskull,
is_wormhole_b0,
profiler,
disable_persistent_kernel_cache,
run_for_wormhole_b0,
run_for_grayskull,
)

from models.demos.ttnn_resnet.tests.ttnn_resnet_test_infra import create_test_infra
Expand Down Expand Up @@ -368,14 +371,130 @@ def run_perf_resnet(
logger.info(f"{model_name} compile time: {compile_time}")


@run_for_grayskull()
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True)
@pytest.mark.parametrize(
    "batch_size, expected_inference_time, expected_compile_time",
    ((20, 0.0080, 20),),
)
def test_perf_bare_metal_gs(
    device,
    use_program_cache,
    batch_size,
    expected_inference_time,
    expected_compile_time,
    hf_cat_image_sample_input,
    model_location_generator,
):
    """Run the ResNet-50 perf check for the plain "resnet50" variant.

    All work is delegated to ``run_perf_resnet``; this wrapper only supplies
    the parametrized batch size, the expected inference/compile times, and
    the variant label. ``use_program_cache`` is requested as a fixture but
    not referenced in the body.
    """
    variant = "resnet50"
    run_perf_resnet(
        batch_size,
        expected_inference_time,
        expected_compile_time,
        hf_cat_image_sample_input,
        device,
        variant,
        model_location_generator,
    )


@run_for_grayskull()
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224}], indirect=True)
@pytest.mark.parametrize(
    "batch_size, enable_async_mode, expected_inference_time, expected_compile_time",
    (
        (20, True, 0.0064, 10),
        (20, False, 0.0064, 5),
    ),
    indirect=["enable_async_mode"],
)
def test_perf_trace_bare_metal_gs(
    device,
    use_program_cache,
    batch_size,
    expected_inference_time,
    expected_compile_time,
    hf_cat_image_sample_input,
    enable_async_mode,
    model_location_generator,
):
    """Run the ResNet-50 perf check for the trace variant, async or sync.

    The variant label passed to ``run_perf_resnet`` is "resnet50_trace_async"
    when the ``enable_async_mode`` fixture value is truthy and
    "resnet50_trace_sync" otherwise. ``use_program_cache`` is requested as a
    fixture but not referenced in the body.
    """
    if enable_async_mode:
        mode = "async"
    else:
        mode = "sync"
    variant = f"resnet50_trace_{mode}"

    run_perf_resnet(
        batch_size,
        expected_inference_time,
        expected_compile_time,
        hf_cat_image_sample_input,
        device,
        variant,
        model_location_generator,
    )


@run_for_grayskull()
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True)
@pytest.mark.parametrize(
    "batch_size, expected_inference_time, expected_compile_time",
    ((20, 0.0100, 19),),
)
def test_perf_2cqs_bare_metal_gs(
    device,
    use_program_cache,
    batch_size,
    expected_inference_time,
    expected_compile_time,
    hf_cat_image_sample_input,
    model_location_generator,
):
    """Run the ResNet-50 perf check for the "resnet50_2cqs" variant.

    The device is parametrized with ``num_hw_cqs`` set to 2; all measurement
    work is delegated to ``run_perf_resnet``. ``use_program_cache`` is
    requested as a fixture but not referenced in the body.
    """
    variant = "resnet50_2cqs"
    run_perf_resnet(
        batch_size,
        expected_inference_time,
        expected_compile_time,
        hf_cat_image_sample_input,
        device,
        variant,
        model_location_generator,
    )


@run_for_grayskull()
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
    "device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224, "num_hw_cqs": 2}], indirect=True
)
@pytest.mark.parametrize(
    "batch_size, expected_inference_time, expected_compile_time",
    ((20, 0.004, 5),),
)
def test_perf_trace_2cqs_bare_metal_gs(
    device,
    use_program_cache,
    batch_size,
    expected_inference_time,
    expected_compile_time,
    hf_cat_image_sample_input,
    model_location_generator,
):
    """Run the ResNet-50 perf check for the "resnet50_trace_2cqs" variant.

    The device is parametrized with both a trace region size and
    ``num_hw_cqs`` set to 2; all measurement work is delegated to
    ``run_perf_resnet``. ``use_program_cache`` is requested as a fixture but
    not referenced in the body.
    """
    variant = "resnet50_trace_2cqs"
    run_perf_resnet(
        batch_size,
        expected_inference_time,
        expected_compile_time,
        hf_cat_image_sample_input,
        device,
        variant,
        model_location_generator,
    )


@run_for_wormhole_b0()
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
@pytest.mark.parametrize(
"batch_size, expected_inference_time, expected_compile_time",
((16, 0.0070, 26),),
((16, 0.0070, 28),),
)
def test_perf_bare_metal(
def test_perf_bare_metal_wh(
device,
use_program_cache,
batch_size,
Expand Down Expand Up @@ -406,7 +525,7 @@ def test_perf_bare_metal(
),
indirect=["enable_async_mode"],
)
def test_perf_trace_bare_metal(
def test_perf_trace_bare_metal_wh(
device,
use_program_cache,
batch_size,
Expand Down Expand Up @@ -435,7 +554,7 @@ def test_perf_trace_bare_metal(
"batch_size, expected_inference_time, expected_compile_time",
((16, 0.0070, 26),),
)
def test_perf_2cqs_bare_metal(
def test_perf_2cqs_bare_metal_wh(
device,
use_program_cache,
batch_size,
Expand Down Expand Up @@ -464,7 +583,7 @@ def test_perf_2cqs_bare_metal(
"batch_size, expected_inference_time, expected_compile_time",
((16, 0.004, 25),),
)
def test_perf_trace_2cqs_bare_metal(
def test_perf_trace_2cqs_bare_metal_wh(
device,
use_program_cache,
batch_size,
Expand Down
Loading

0 comments on commit f03bf93

Please sign in to comment.