From f03bf935365f9a81203c53087fc76d2b3bb74c37 Mon Sep 17 00:00:00 2001
From: Yu Gao <145494740+yugaoTT@users.noreply.github.com>
Date: Tue, 3 Sep 2024 10:20:32 -0400
Subject: [PATCH] Use new resnet test infra for GS (#12072)

* #0: use new resnet test infra for GS

* #0: adjust 2cq e2e time

* #0: change to fn wrapper for gs and wh

* #0: update README

* #0: adjust wh compile time
---
 README.md                                  |   2 +-
 models/demos/ttnn_resnet/README.md         |  23 +-
 .../tests/test_perf_device_ttnn_resnet.py  |  36 ++-
 .../tests/test_perf_ttnn_resnet.py         | 129 ++++++++++-
 .../tests/test_ttnn_resnet50_performant.py | 215 +++++++++++++++---
 tests/scripts/run_performance.sh           |  16 +-
 6 files changed, 365 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index 7e993d1cc8f..f734e045213 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@

 | Model | Batch | End-to-end throughput [1] | Device throughput [2] | Target throughput |
 |---------------------------------------------------------- |---------------------|------------------------------|-----------------------------|-------------------------------------|
-| [ResNet-50](./models/demos/resnet) (fps) | 20 | 5,600 | 7,560 | 10,000 |
+| [ResNet-50](./models/demos/ttnn_resnet) (fps) | 20 | 5,100 | 6,600 | 10,000 |
 | [BERT-Large](./models/demos/bert) (sen/s) | 12 | 370 | 406 | 410 |
 | [Falcon7B-decode](./models/demos/ttnn_falcon7b) (t/s) | 32 | 135 | 135 | 140 |
 | [ViT](./models/demos/grayskull/vit) (fps) | 8 | 860 | 1570 | 2000 |
diff --git a/models/demos/ttnn_resnet/README.md b/models/demos/ttnn_resnet/README.md
index e8be0f3948d..4beed8bd285 100644
--- a/models/demos/ttnn_resnet/README.md
+++ b/models/demos/ttnn_resnet/README.md
@@ -28,14 +28,25 @@ To obtain a huggingface token visit: https://huggingface.co/docs/hub/security-to

 ### Single Device

-#### Device Performance
-+ To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./tt_metal/tools/profiler/profile_this.py -c "pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[16-act_dtype0-weight_dtype0-math_fidelity0-device_params0]"`
-+ This will generate a CSV report under `/generated/profiler/reports/ops/`. The report file name is logged in the run output.
+#### Grayskull Device Performance
++ To obtain device performance, run `pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_gs`
++ This will run the model 4 times and generate CSV reports under `/generated/profiler/reports/ops/`. The report file name is logged in the run output.
++ It will also show a summary of the device throughput in the run output.

-#### End-to-End Performance
-+ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal[16-0.004-25-device_params0]`.
+#### Grayskull End-to-End Performance
++ For end-to-end performance, run `pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal_gs[20-0.004-5-device_params0]`.
++ This will generate a CSV with the timings and throughputs.
++ **Expected end-to-end perf**: For batch = 20, it is about `5,100 fps` currently. This may vary machine to machine.
+
+#### Wormhole_B0 Device Performance
++ To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_wh`
++ This will run the model 4 times and generate CSV reports under `/generated/profiler/reports/ops/`. The report file name is logged in the run output.
++ It will also show a summary of the device throughput in the run output.
+
+#### Wormhole_B0 End-to-End Performance
++ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal_wh[16-0.004-25-device_params0]`.
 + This will generate a CSV with the timings and throughputs.
-+ **Expected end-to-end perf**: For batch = 16, it is about `4100 fps` currently. This may vary machine to machine.
++ **Expected end-to-end perf**: For batch = 16, it is about `4,100 fps` currently. This may vary machine to machine.

 ### T3000
 #### End-to-End Performance
diff --git a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py
index 6ffa491f513..50ee10a95d3 100644
--- a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py
+++ b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py
@@ -4,7 +4,35 @@
 import pytest

 from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
-from models.utility_functions import run_for_wormhole_b0
+from models.utility_functions import run_for_wormhole_b0, run_for_grayskull
+
+
+@run_for_grayskull()
+@pytest.mark.models_device_performance_bare_metal
+@pytest.mark.parametrize(
+    "batch_size, test, expected_perf",
+    [
+        [20, "20-act_dtype0-weight_dtype0-math_fidelity0-device_params0", 6600],
+    ],
+)
+def test_perf_device_bare_metal_gs(batch_size, test, expected_perf):
+    subdir = "resnet50"
+    num_iterations = 4
+    margin = 0.03
+    command = f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference_gs[{test}]"
+    cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
+    inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
+    expected_perf_cols = {inference_time_key: expected_perf}
+
+    post_processed_results = run_device_perf(command, subdir, num_iterations, cols, batch_size, True)
+    expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols)
+    prep_device_perf_report(
+        model_name=f"ttnn_resnet50_batch_size{batch_size}",
+        batch_size=batch_size,
+        post_processed_results=post_processed_results,
+        expected_results=expected_results,
+        comments=test,
+    )


 @run_for_wormhole_b0()
@@ -15,13 +43,11 @@
         [16, "16-act_dtype0-weight_dtype0-math_fidelity0-device_params0", 5020],
     ],
 )
-def test_perf_device_bare_metal(batch_size, test, expected_perf):
+def test_perf_device_bare_metal_wh(batch_size, test, expected_perf):
     subdir = "resnet50"
     num_iterations = 4
     margin = 0.03
-    command = (
-        f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[{test}]"
-    )
+    command = f"pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference_wh[{test}]"
     cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
     inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
     expected_perf_cols = {inference_time_key: expected_perf}
diff --git a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py
index 264bec76e72..047d1ad12e8 100644
--- a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py
+++ b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py
@@ -13,9 +13,12 @@
 )

 from models.utility_functions import (
+    is_grayskull,
+    is_wormhole_b0,
     profiler,
     disable_persistent_kernel_cache,
     run_for_wormhole_b0,
+    run_for_grayskull,
 )
 from models.demos.ttnn_resnet.tests.ttnn_resnet_test_infra import create_test_infra

@@ -368,14 +371,130 @@ def run_perf_resnet(
     logger.info(f"{model_name} compile time: {compile_time}")


+@run_for_grayskull()
+@pytest.mark.models_performance_bare_metal
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, expected_inference_time, expected_compile_time",
+    ((20, 0.0080, 20),),
+)
+def test_perf_bare_metal_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    expected_inference_time,
+    expected_compile_time,
+    hf_cat_image_sample_input,
+    model_location_generator,
+):
+    run_perf_resnet(
+        batch_size,
+        expected_inference_time,
+        expected_compile_time,
+        hf_cat_image_sample_input,
+        device,
+        "resnet50",
+        model_location_generator,
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.models_performance_bare_metal
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, enable_async_mode, expected_inference_time, expected_compile_time",
+    (
+        (20, True, 0.0064, 10),
+        (20, False, 0.0064, 5),
+    ),
+    indirect=["enable_async_mode"],
+)
+def test_perf_trace_bare_metal_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    expected_inference_time,
+    expected_compile_time,
+    hf_cat_image_sample_input,
+    enable_async_mode,
+    model_location_generator,
+):
+    mode = "async" if enable_async_mode else "sync"
+    run_perf_resnet(
+        batch_size,
+        expected_inference_time,
+        expected_compile_time,
+        hf_cat_image_sample_input,
+        device,
+        f"resnet50_trace_{mode}",
+        model_location_generator,
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.models_performance_bare_metal
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, expected_inference_time, expected_compile_time",
+    ((20, 0.0100, 19),),
+)
+def test_perf_2cqs_bare_metal_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    expected_inference_time,
+    expected_compile_time,
+    hf_cat_image_sample_input,
+    model_location_generator,
+):
+    run_perf_resnet(
+        batch_size,
+        expected_inference_time,
+        expected_compile_time,
+        hf_cat_image_sample_input,
+        device,
+        "resnet50_2cqs",
+        model_location_generator,
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.models_performance_bare_metal
+@pytest.mark.parametrize(
+    "device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224, "num_hw_cqs": 2}], indirect=True
+)
+@pytest.mark.parametrize(
+    "batch_size, expected_inference_time, expected_compile_time",
+    ((20, 0.004, 5),),
+)
+def test_perf_trace_2cqs_bare_metal_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    expected_inference_time,
+    expected_compile_time,
+    hf_cat_image_sample_input,
+    model_location_generator,
+):
+    run_perf_resnet(
+        batch_size,
+        expected_inference_time,
+        expected_compile_time,
+        hf_cat_image_sample_input,
+        device,
+        "resnet50_trace_2cqs",
+        model_location_generator,
+    )
+
+
 @run_for_wormhole_b0()
 @pytest.mark.models_performance_bare_metal
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
 @pytest.mark.parametrize(
     "batch_size, expected_inference_time, expected_compile_time",
-    ((16, 0.0070, 26),),
+    ((16, 0.0070, 28),),
 )
-def test_perf_bare_metal(
+def test_perf_bare_metal_wh(
     device,
     use_program_cache,
     batch_size,
@@ -406,7 +525,7 @@
     ),
     indirect=["enable_async_mode"],
 )
-def test_perf_trace_bare_metal(
+def test_perf_trace_bare_metal_wh(
     device,
     use_program_cache,
     batch_size,
@@ -435,7 +554,7 @@
     "batch_size, expected_inference_time, expected_compile_time",
     ((16, 0.0070, 26),),
 )
-def test_perf_2cqs_bare_metal(
+def test_perf_2cqs_bare_metal_wh(
     device,
     use_program_cache,
     batch_size,
@@ -464,7 +583,7 @@
     "batch_size, expected_inference_time, expected_compile_time",
     ((16, 0.004, 25),),
 )
-def test_perf_trace_2cqs_bare_metal(
+def test_perf_trace_2cqs_bare_metal_wh(
     device,
     use_program_cache,
     batch_size,
diff --git a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py
index 3bba1fbe0fe..764bef2cd67 100644
--- a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py
+++ b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py
@@ -7,9 +7,8 @@
 import ttnn
 from models.utility_functions import (
     is_wormhole_b0,
-    divup,
-    skip_for_grayskull,
 )
+from models.utility_functions import run_for_wormhole_b0, run_for_grayskull
 from models.demos.ttnn_resnet.tests.ttnn_resnet_test_infra import create_test_infra

 try:
@@ -20,13 +19,7 @@
     use_signpost = False


-@skip_for_grayskull(reason_str="Untested for Grayskull")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
-@pytest.mark.parametrize(
-    "batch_size, act_dtype, weight_dtype, math_fidelity",
-    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
-)
-def test_run_resnet50_inference(
+def run_resnet50_inference(
     device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
 ):
     if batch_size == 8:
@@ -61,14 +54,7 @@
     test_infra.validate()


-@skip_for_grayskull(reason_str="Untested for Grayskull")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True)
-@pytest.mark.parametrize(
-    "batch_size, act_dtype, weight_dtype, math_fidelity",
-    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
-)
-@pytest.mark.parametrize("enable_async", [True, False])
-def test_run_resnet50_trace_inference(
+def run_resnet50_trace_inference(
     device,
     use_program_cache,
     batch_size,
@@ -127,13 +113,7 @@
     device.enable_async(False)


-@skip_for_grayskull(reason_str="Untested for Grayskull")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True)
-@pytest.mark.parametrize(
-    "batch_size, act_dtype, weight_dtype, math_fidelity",
-    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
-)
-def test_run_resnet50_2cqs_inference(
+def run_resnet50_2cqs_inference(
     device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
 ):
     if batch_size == 8:
@@ -196,16 +176,7 @@
     test_infra.validate(output)


-@skip_for_grayskull(reason_str="Untested for Grayskull")
-@pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_hw_cqs": 2}], indirect=True
-)
-@pytest.mark.parametrize(
-    "batch_size, act_dtype, weight_dtype, math_fidelity",
-    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
-)
-@pytest.mark.parametrize("enable_async", [True, False])
-def test_run_resnet50_trace_2cqs_inference(
+def run_resnet50_trace_2cqs_inference(
     device,
     use_program_cache,
     batch_size,
@@ -305,3 +276,179 @@
     test_infra.validate(output)

     device.enable_async(False)
+
+
+@run_for_grayskull()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((20, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+def test_run_resnet50_inference_gs(
+    device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+):
+    run_resnet50_inference(
+        device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((20, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async", [True, False])
+def test_run_resnet50_trace_inference_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    act_dtype,
+    weight_dtype,
+    math_fidelity,
+    enable_async,
+    model_location_generator,
+):
+    run_resnet50_trace_inference(
+        device,
+        use_program_cache,
+        batch_size,
+        act_dtype,
+        weight_dtype,
+        math_fidelity,
+        enable_async,
+        model_location_generator,
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((20, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+def test_run_resnet50_2cqs_inference_gs(
+    device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+):
+    run_resnet50_2cqs_inference(
+        device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+    )
+
+
+@run_for_grayskull()
+@pytest.mark.parametrize(
+    "device_params", [{"l1_small_size": 32768, "trace_region_size": 1332224, "num_hw_cqs": 2}], indirect=True
+)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((20, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async", [True, False])
+def test_run_resnet50_trace_2cqs_inference_gs(
+    device,
+    use_program_cache,
+    batch_size,
+    act_dtype,
+    weight_dtype,
+    math_fidelity,
+    enable_async,
+    model_location_generator,
+):
+    run_resnet50_trace_2cqs_inference(
+        device,
+        use_program_cache,
+        batch_size,
+        act_dtype,
+        weight_dtype,
+        math_fidelity,
+        enable_async,
+        model_location_generator,
+    )
+
+
+@run_for_wormhole_b0()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+def test_run_resnet50_inference_wh(
+    device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+):
+    run_resnet50_inference(
+        device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+    )
+
+
+@run_for_wormhole_b0()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async", [True, False])
+def test_run_resnet50_trace_inference_wh(
+    device,
+    use_program_cache,
+    batch_size,
+    act_dtype,
+    weight_dtype,
+    math_fidelity,
+    enable_async,
+    model_location_generator,
+):
+    run_resnet50_trace_inference(
+        device,
+        use_program_cache,
+        batch_size,
+        act_dtype,
+        weight_dtype,
+        math_fidelity,
+        enable_async,
+        model_location_generator,
+    )
+
+
+@run_for_wormhole_b0()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+def test_run_resnet50_2cqs_inference_wh(
+    device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+):
+    run_resnet50_2cqs_inference(
+        device, use_program_cache, batch_size, act_dtype, weight_dtype, math_fidelity, model_location_generator
+    )
+
+
+@run_for_wormhole_b0()
+@pytest.mark.parametrize(
+    "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_hw_cqs": 2}], indirect=True
+)
+@pytest.mark.parametrize(
+    "batch_size, act_dtype, weight_dtype, math_fidelity",
+    ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async", [True, False])
+def test_run_resnet50_trace_2cqs_inference_wh(
+    device,
+    use_program_cache,
+    batch_size,
+    act_dtype,
+    weight_dtype,
+    math_fidelity,
+    enable_async,
+    model_location_generator,
+):
+    run_resnet50_trace_2cqs_inference(
+        device,
+        use_program_cache,
+        batch_size,
+        act_dtype,
+        weight_dtype,
+        math_fidelity,
+        enable_async,
+        model_location_generator,
+    )
diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh
index b6734f6b94b..cd728307b14 100755
--- a/tests/scripts/run_performance.sh
+++ b/tests/scripts/run_performance.sh
@@ -11,14 +11,18 @@ run_perf_models_other() {
     local tt_arch=$1
     local test_marker=$2

-    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py -m $test_marker
+    if [ "$tt_arch" == "grayskull" ]; then
+        env pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py -m $test_marker
+    fi
+
+    if [ "$tt_arch" == "wormhole_b0" ]; then
+        env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py -m $test_marker
+    fi

     env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker

     env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker

-    env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
-
     env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker

     env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker
@@ -59,8 +63,6 @@ run_device_perf_models() {
     set -eo pipefail
     local test_marker=$1

-    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests -m $test_marker
-
     env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600

     if [ "$tt_arch" == "grayskull" ]; then
@@ -68,6 +70,8 @@
         #Model Device perf regression tests to make sure thy run on no-soft-reset BMs
         tests/scripts/run_profiler_regressions.sh PROFILER_NO_RESET

+        env pytest models/demos/ttnn_resnet/tests -m $test_marker
+
         env pytest models/demos/metal_BERT_large_11/tests -m $test_marker

         env pytest models/demos/ttnn_falcon7b/tests -m $test_marker --timeout=360
@@ -78,6 +82,8 @@
     fi

     if [ "$tt_arch" == "wormhole_b0" ]; then
+        env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests -m $test_marker
+
         env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mamba/tests -m $test_marker

         env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/metal_BERT_large_11/tests -m $test_marker
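
A quick local-reproduction sketch for the checks this patch adds. The commands are the ones given in the updated README and `tests/scripts/run_performance.sh`; the only assumptions are a configured tt-metal environment and a machine of the matching architecture, since the `run_for_grayskull()` / `run_for_wormhole_b0()` decorators used above select which tests actually run.

```bash
# Grayskull device-throughput gate: re-runs test_run_resnet50_inference_gs 4 times
# under the profiler and checks AVG DEVICE KERNEL SAMPLES/S against 6600 with a 3% margin.
pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_gs

# Grayskull end-to-end trace + 2 command-queue gate (batch 20, ~5,100 fps expected).
pytest "models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal_gs[20-0.004-5-device_params0]"

# Wormhole B0 equivalents additionally need the eth-dispatch YAML, e.g.:
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml \
  pytest models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py::test_perf_device_bare_metal_wh
```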