# Add perf report for reduce scatter async (#17223)
### Ticket
#16648 

### Problem description
The CCL async perf tooling had no report path for the async reduce scatter op: `async_perf_csv.py` only recognized all-gather op codes (one of them misspelled as `AllGatherAsyn`), so `ReduceScatterAsync` runs fell through to the generic `CCL_Perf_*.csv` output, and there was no profiling script or perf test exercising the op.

### What's changed
- `async_perf_csv.py`: recognize the async op codes: fix the `AllGatherAsyn` typo, add a `ReduceScatterAsync` branch writing `CCL_reduce_scatter_async_Perf_<date>.csv`, drop the stale sync-op branches, strip the new `forward_device`/`backward_device` attribute keys, and fall back to `num_links_preferred` when `num_links` is absent.
- New `run_profile.sh` helper that profiles `test_reduce_scatter_async_<target>` and prints a min/avg/max report.
- New `test_reduce_scatter_async_t3000` perf test in `test_ccl_async_perf.py`.
- `test_reduce_scatter_async.py`: switch to `ttnn.experimental.reduce_scatter_async`, passing explicit global-semaphore handles and `worker_sub_device_id` instead of `create_semaphore_handles`.

<img width="1299" alt="Screenshot 2025-01-28 at 5 41 07 PM"
src="https://github.com/user-attachments/assets/4c9353a0-d001-470d-8417-56cccb661ddd"
/>


### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass
- [ ] New/Existing tests provide coverage for changes
Aswinmcw authored and yieldthought committed Jan 31, 2025
1 parent cb581f1 commit b3213e4
Showing 4 changed files with 195 additions and 15 deletions.
14 changes: 7 additions & 7 deletions tests/ttnn/unit_tests/operations/ccl/perf/async_perf_csv.py
```diff
@@ -18,7 +18,7 @@ def perf_report(file_path):
     def remove_keys_from_attributes(attributes):
         attributes = attributes.replace(";", ",").replace("'", '"')

-        keys_to_remove = ["receiver_device_id", "ring_index", "sender_device_id"]
+        keys_to_remove = ["receiver_device_id", "ring_index", "sender_device_id", "forward_device", "backward_device"]

         try:
             attributes_dict = eval(attributes)
@@ -63,7 +63,9 @@ def safe_parse_attributes(attributes):
     )

     df["num_links"] = df["ATTRIBUTES"].apply(
-        lambda x: safe_parse_attributes(x).get("num_links", "") if isinstance(safe_parse_attributes(x), dict) else ""
+        lambda x: safe_parse_attributes(x).get("num_links", safe_parse_attributes(x).get("num_links_preferred", ""))
+        if isinstance(safe_parse_attributes(x), dict)
+        else ""
     )

     df["output_mem_config"] = df["ATTRIBUTES"].apply(
@@ -164,12 +166,10 @@ def safe_parse_attributes(attributes):
     op_code = averages_df.iloc[0]["OP CODE"]

     today = time.strftime("%Y_%m_%d")
-    if op_code == "AllGather":
-        ccl_perf_file_path = f"CCL_all_gather_Perf_{today}.csv"
-    elif op_code == "AllGatherAsyn":
+    if op_code == "AllGatherAsync":
         ccl_perf_file_path = f"CCL_all_gather_async_Perf_{today}.csv"
-    elif op_code == "ReduceScatter":
-        ccl_perf_file_path = f"CCL_reduce_scatter_Perf_{today}.csv"
+    elif op_code == "ReduceScatterAsync":
+        ccl_perf_file_path = f"CCL_reduce_scatter_async_Perf_{today}.csv"
     else:
         ccl_perf_file_path = f"CCL_Perf_{today}.csv"

```
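For context, a minimal standalone sketch of the two pieces these hunks touch, with names taken from the diff (this uses `json.loads` where the file itself uses `eval`, and the sample attribute string is hypothetical):

```python
import json


def remove_keys_from_attributes(attributes: str) -> dict:
    """Normalize the profiler's attribute string and drop per-device routing
    keys; forward_device/backward_device are the keys added by this commit."""
    attributes = attributes.replace(";", ",").replace("'", '"')
    keys_to_remove = [
        "receiver_device_id",
        "ring_index",
        "sender_device_id",
        "forward_device",
        "backward_device",
    ]
    attributes_dict = json.loads(attributes)  # async_perf_csv.py uses eval()
    for key in keys_to_remove:
        attributes_dict.pop(key, None)
    return attributes_dict


def get_num_links(attributes_dict: dict):
    # New fallback: async ops may report num_links_preferred instead.
    return attributes_dict.get("num_links", attributes_dict.get("num_links_preferred", ""))


attrs = remove_keys_from_attributes("{'num_links_preferred': 1; 'ring_index': 0}")
print(attrs, get_num_links(attrs))  # -> {'num_links_preferred': 1} 1
```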
107 changes: 107 additions & 0 deletions tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh
```sh
#!/bin/sh
MODULE_DIR="tests/ttnn/unit_tests/operations/ccl/perf"

# Defaults
DEBUG=false
TARGET="n300"

# Function to display help
show_help() {
    echo "Usage: ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh [OPTIONS]"
    echo
    echo "Options:"
    echo "  -d, --debug    Enable debug mode to show real-time output."
    echo "  -t, --target   Specify the target configuration (t3000 or n300 or tg). Default is n300."
    echo "  -h, --help     Display this help message."
    echo
    echo "Example:"
    echo "  ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh --debug --target n300"
    echo "  ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -h"
}

# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
        --debug|-d)
            DEBUG=true
            shift
            ;;
        --help|-h)
            show_help
            exit 0
            ;;
        --target|-t)
            # Ensure there is an argument following the target flag
            if [ -z "$2" ]; then
                echo "Error: No target specified after $1."
                show_help
                exit 1
            fi

            TARGET="$2"  # Set the target configuration
            shift 2

            # Validate the target value
            if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "tg" ] && [ "$TARGET" != "n300" ]; then
                echo "Error: Invalid target configuration: $TARGET. Must be 't3000' or 'n300' or 'tg'."
                exit 1
            fi
            ;;
        *)
            echo "Unknown option: $1"
            show_help
            exit 1
            ;;
    esac
done

# Function to run the profiling command and extract the CSV path
run_profile_and_extract_csv() {
    command="./tt_metal/tools/profiler/profile_this.py -n reduce_scatter_async_$TARGET -c 'pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_async_perf.py::test_reduce_scatter_async_$TARGET'"

    if [ "$DEBUG" = true ]; then
        echo "Running profiling command for target $TARGET in debug mode..."
        full_output=$(eval $command 2>&1 | tee /dev/tty)
    else
        echo "Running profiling command for target $TARGET..."
        full_output=$(eval $command 2>&1)
    fi

    # Extract the CSV path
    csv_path=$(echo "$full_output" | grep -oE 'OPs csv generated at: (.+\.csv)' | sed -E 's/OPs csv generated at: //')

    if [ -n "$csv_path" ]; then
        echo "CSV path found: $csv_path"
        echo "Generating performance report..."

        tmp_file="/tmp/perf_report_output.log"
        PYTHONPATH="$MODULE_DIR" python3 -c "
import sys
import pandas as pd
from async_perf_csv import perf_report
from tabulate import tabulate
try:
    # Generate the report and convert it to a DataFrame
    average_df = perf_report('$csv_path')
    # Print the DataFrame in a pretty table format
    print('Min - Avg - Max by Common Runs:')
    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
except Exception as e:
    print(f'Error in performance report generation: {e}', file=sys.stderr)
    sys.exit(1)
" 2>&1 | tee "$tmp_file"

        if grep -q "Error in performance report generation" "$tmp_file"; then
            echo "Error: Performance report generation failed."
            exit 1
        fi

    else
        echo "CSV path not found in the command output."
        exit 1
    fi
}

# Run the function
run_profile_and_extract_csv
```
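Typical invocations of the new script (assuming it is run from the repository root with the profiler built):

```sh
# Profile reduce_scatter_async on the default n300 target
./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh

# Profile a T3000 machine, streaming profiler output as it runs
./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh --target t3000 --debug
```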
67 changes: 67 additions & 0 deletions tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_async_perf.py
```diff
@@ -8,6 +8,9 @@
 from tests.ttnn.unit_tests.operations.ccl.test_new_all_gather import (
     run_all_gather_impl,
 )
+from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_async import (
+    run_reduce_scatter_test,
+)
 from tests.ttnn.unit_tests.operations.ccl.test_all_gather_TG_post_commit import (
     run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
 )
@@ -141,3 +144,67 @@ def test_all_gather_async_tg(
         teardown_persistent_fabric=True,
         trace_mode=True,
     )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links",
+    [
+        (4, 1),
+    ],
+)
+@pytest.mark.parametrize(
+    "per_chip_output_shape, dim, layout",
+    [
+        ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+    ],
+)
+@pytest.mark.parametrize(
+    "mem_config",
+    [
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
+    ],
+)
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("enable_async", [False])
+@pytest.mark.parametrize("trace_mode", [True])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 1824800}], indirect=True)
+def test_reduce_scatter_async_t3000(
+    t3k_mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    mem_config,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    trace_mode,
+    num_iters=20,
+):
+    run_reduce_scatter_test(
+        t3k_mesh_device,
+        num_devices,
+        per_chip_output_shape,
+        dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        mem_config,
+        use_program_cache,
+        function_level_defaults,
+        num_iters=num_iters,
+        enable_async=enable_async,
+        topology=ttnn.Topology.Linear,
+        trace_mode=trace_mode,
+    )
```
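`run_profile.sh` wraps this test via `profile_this.py`; it can also be run directly as a plain pytest invocation (no profiling, assuming a T3000 mesh is available):

```sh
pytest "tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_async_perf.py::test_reduce_scatter_async_t3000"
```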
22 changes: 14 additions & 8 deletions tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_async.py
```diff
@@ -42,19 +42,22 @@ def run_with_trace(
     output_mem_config,
     num_iters=40,
     topology=ttnn.Topology.Ring,
-    subdevice_id=None,
+    from_remote_semaphore_handles=None,
+    to_remote_semaphore_handles=None,
+    worker_sub_device_id=None,
 ):
     # Compile Run
     logger.info("Compiling model")
-    output_tensor_mesh = ttnn.reduce_scatter_async(
+    output_tensor_mesh = ttnn.experimental.reduce_scatter_async(
         input_tensor_mesh,
         dim=dim,
+        from_remote_multi_device_global_semaphore=from_remote_semaphore_handles,
+        to_remote_multi_device_global_semaphore=to_remote_semaphore_handles,
         math_op=math_op,
         num_links=num_links,
         memory_config=output_mem_config,
         topology=topology,
-        subdevice_id=subdevice_id,
-        create_semaphore_handles=True,
+        subdevice_id=worker_sub_device_id,
     )
     for device_id in t3k_mesh_device.get_device_ids():
         ttnn.synchronize_device(t3k_mesh_device.get_device(device_id))
@@ -63,15 +66,16 @@ def run_with_trace(
     logger.info("Capturing trace")
     trace_id = ttnn.begin_trace_capture(t3k_mesh_device, cq_id=0)
     for i in range(num_iters):
-        output_tensor_mesh = ttnn.reduce_scatter_async(
+        output_tensor_mesh = ttnn.experimental.reduce_scatter_async(
             input_tensor_mesh,
             dim=dim,
+            from_remote_multi_device_global_semaphore=from_remote_semaphore_handles,
+            to_remote_multi_device_global_semaphore=to_remote_semaphore_handles,
             math_op=math_op,
             num_links=num_links,
             memory_config=output_mem_config,
             topology=topology,
-            subdevice_id=subdevice_id,
-            create_semaphore_handles=False,
+            subdevice_id=worker_sub_device_id,
         )
     ttnn.end_trace_capture(t3k_mesh_device, trace_id, cq_id=0)
     for device_id in t3k_mesh_device.get_device_ids():
@@ -223,7 +227,9 @@ def run_reduce_scatter_test(
             output_mem_config,
             num_iters=num_iters,
             topology=topology,
-            subdevice_id=worker_sub_device_id,
+            from_remote_semaphore_handles=from_remote_semaphore_handles,
+            to_remote_semaphore_handles=to_remote_semaphore_handles,
+            worker_sub_device_id=worker_sub_device_id,
         )
     else:
         logger.info(f"Running {num_iters} iterations of reduce scatter")
```
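For reference, a sketch of the updated call contract assembled from the hunks above: explicit global-semaphore handles replace the removed `create_semaphore_handles` flag (handle creation happens elsewhere in the test harness and is not shown).

```python
def reduce_scatter_async_once(
    input_tensor_mesh,
    dim,
    from_remote_semaphore_handles,
    to_remote_semaphore_handles,
    math_op,
    num_links,
    output_mem_config,
    topology,
    worker_sub_device_id,
):
    """Sketch only; argument names follow the diff above, and every value is
    supplied by run_reduce_scatter_test in the real test."""
    import ttnn  # assumes a tt-metal environment

    return ttnn.experimental.reduce_scatter_async(
        input_tensor_mesh,
        dim=dim,
        from_remote_multi_device_global_semaphore=from_remote_semaphore_handles,
        to_remote_multi_device_global_semaphore=to_remote_semaphore_handles,
        math_op=math_op,
        num_links=num_links,
        memory_config=output_mem_config,
        topology=topology,
        subdevice_id=worker_sub_device_id,
    )
```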
