Add support for parallelization along the width for untilize with unpadding (#17538)

### Ticket
#17537

### Problem description
Currently, the untilize with unpadding implementation supports
parallelization only along the height dimension. This hurts performance for
wide tensors, since their few tile rows map them to only a small number of cores.
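
As a rough illustration (a minimal sketch assuming 32x32 tiles and a height-only split; not code from this change): a wide tensor such as `[1, 1, 32, 10912]` has a single row of tiles but 341 tile columns, so a height-only split keeps all the work on one core.

```python
# Minimal sketch, not the op's dispatch code: count tile rows vs. tile columns,
# assuming 32x32 tiles.
TILE = 32


def tile_grid(shape, tile=TILE):
    *_, h, w = shape
    return (h + tile - 1) // tile, (w + tile - 1) // tile  # (tile rows, tile columns)


rows, cols = tile_grid([1, 1, 32, 10912])
print(rows, cols)  # 1 341 -> a height-only split can use at most one core
```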

### What's changed
This PR introduces support for parallelizing the untilize-with-unpadding
operation along the width dimension, similar to tilize with padding. The
operation now parallelizes over whichever dimension has the larger number of
tiles.
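
A hedged sketch of that selection rule (illustrative only; the helper name and the assumption that tile rows are counted across the batch dimensions are mine, not taken from this change):

```python
import math

TILE = 32  # assumed 32x32 tiles


def pick_parallel_dim(shape, tile=TILE):
    """Illustrative only: parallelize along whichever dimension has more tiles."""
    *batch, h, w = shape
    total_tile_rows = math.prod(batch) * ((h + tile - 1) // tile)
    tile_cols = (w + tile - 1) // tile
    return "width" if tile_cols > total_tile_rows else "height"


print(pick_parallel_dim([1, 1, 32, 10912]))  # width  (1 tile row, 341 tile columns)
print(pick_parallel_dim([5, 5, 64, 96]))     # height (50 tile rows, 3 tile columns)
```
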
In future versions:
- we want the operation to support parallelization along both
dimensions simultaneously
- we want the compute kernel to process an entire column block at once
instead of one tile at a time

For the tests added in test_to_layout.py, the kernel duration of the
previous implementation is roughly 1.8 to 24.8 times that of the new
implementation.


### Checklist
- [x] Post commit CI passes
https://github.com/tenstorrent/tt-metal/actions/runs/13121055787
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass
- [ ] New/Existing tests provide coverage for changes
nardoTT authored Feb 5, 2025
1 parent fb0d2fa commit 1b01e8b
Showing 6 changed files with 424 additions and 6 deletions.
@@ -21,22 +21,53 @@ def create_grid(x, y):


params = [
pytest.param([[5, 5, 32, 32]], untilize_with_unpadding_args)
for untilize_with_unpadding_args in generation_funcs.gen_untilize_with_unpadding_args([[5, 5, 32, 32]])
pytest.param(
[[5, 5, 32, 32]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1),
"output_tensor_end": [4, 4, 31, 28],
},
)
]

params += [
pytest.param([[5, 5, 64, 96]], untilize_with_unpadding_args)
for untilize_with_unpadding_args in generation_funcs.gen_untilize_with_unpadding_args([[5, 5, 64, 96]])
pytest.param(
[[5, 5, 64, 96]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
"output_tensor_end": [4, 4, 60, 90],
},
)
]

params += [
pytest.param(
[[1, 1, 128, 7328]],
[[5, 5, 64, 96]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
"output_tensor_end": [4, 4, 60, 90],
},
)
]


params += [
pytest.param(
[[1, 1, 128, 7328]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1),
"output_tensor_end": [0, 0, 119, 7299],
},
)
56 changes: 56 additions & 0 deletions tests/ttnn/unit_tests/test_to_layout.py
@@ -283,3 +283,59 @@ def test_to_layout_page_error(shape, device):
torch_output = torch_tensor
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[64, 7680]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w1(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [36, 7667])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:37, :7668], output_tensor)


@pytest.mark.parametrize("shape", [[2, 32, 6144]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w2(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [1, 30, 6140])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :31, :6141], output_tensor)


@pytest.mark.parametrize("shape", [[1, 1, 32, 1536]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w3(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [0, 0, 31, 1535])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :, :32, :1536], output_tensor)


@pytest.mark.parametrize("shape", [[1, 1, 32, 10912]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w4(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [0, 0, 0, 10911])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :, :1, :10912], output_tensor)
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api/untilize.h"
#include "debug/dprint.h"

namespace NAMESPACE {
void MAIN {
    uint32_t per_core_block_cnt = get_compile_time_arg_val(0);
    uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1);
    uint32_t third_dim = get_compile_time_arg_val(2);
    untilize_init(tt::CBIndex::c_0, tt::CBIndex::c_16);

    uint32_t onetile = 1;
    for (uint32_t b = 0; b < per_core_block_cnt * per_core_block_tile_cnt * third_dim; ++b) {
        cb_wait_front(tt::CBIndex::c_0, onetile);
        cb_reserve_back(tt::CBIndex::c_16, onetile);

        untilize_block(tt::CBIndex::c_0, onetile, tt::CBIndex::c_16);

        cb_push_back(tt::CBIndex::c_16, onetile);
        cb_pop_front(tt::CBIndex::c_0, onetile);
    }
}
} // namespace NAMESPACE
@@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"

void kernel_main() {
    // Output circular buffer filled by the compute kernel with untilized rows.
    constexpr uint32_t cb_id_out0 = 16;

    const uint32_t total_num_rows = get_compile_time_arg_val(3);
    const uint32_t ncores = get_compile_time_arg_val(4);
    const uint32_t third_dim = get_compile_time_arg_val(5);
    const uint32_t tile_width = get_compile_time_arg_val(6);

    const uint32_t dst_addr = get_arg_val<uint32_t>(0);
    const uint32_t unpadded_X_size = get_arg_val<uint32_t>(1);
    const uint32_t core_number = get_arg_val<uint32_t>(2);

    constexpr bool dst0_is_dram = get_compile_time_arg_val(0) == 1;

#define stick_size_is_pow2 get_compile_time_arg_val(1) == 1
#if (stick_size_is_pow2)
    constexpr uint32_t log_base_2_of_page_size = get_compile_time_arg_val(2);
    const InterleavedPow2AddrGen<dst0_is_dram> s = {
        .bank_base_address = dst_addr, .log_base_2_of_page_size = log_base_2_of_page_size};
#else
    const InterleavedAddrGen<dst0_is_dram> s = {.bank_base_address = dst_addr, .page_size = unpadded_X_size};
#endif

    // Writes one width block of every row for this core to the destination buffer.
    auto write_block = [&](uint32_t num_rows,
                           uint32_t mul,
                           uint32_t size_per_row_per_block,
                           uint32_t start_id,
                           uint32_t width_size,
                           uint32_t size_2d) {
        uint32_t onetile = 1;
        bool has_rows = (num_rows) > 0;

        cb_wait_front(cb_id_out0, onetile * has_rows);
        uint32_t l1_read_addr = get_write_ptr(cb_id_out0);

        for (uint32_t k = 0; k < num_rows; k++) {
            uint64_t dst_noc_addr = get_noc_addr(size_2d + k, s);

            uint32_t total_size = mul * size_per_row_per_block + start_id + width_size;
            uint32_t padded_size = total_size - unpadded_X_size;
            uint32_t write_size = width_size;

            // The last core trims the padded tail so only unpadded data is written.
            if (mul == ncores - 1 && padded_size > 0) {
                write_size = width_size - padded_size;
            }

            noc_async_write(l1_read_addr, dst_noc_addr + start_id + mul * size_per_row_per_block, write_size);

            noc_async_write_barrier();

            // Every tile_width rows, release the consumed tile and wait for the next one.
            if (k > 0 && (k % tile_width == 0)) {
                cb_pop_front(cb_id_out0, onetile * has_rows);
                cb_wait_front(cb_id_out0, onetile * has_rows);
            }
            l1_read_addr += width_size;
        }

        cb_pop_front(cb_id_out0, onetile * has_rows);
    };

    const uint32_t size_per_row_per_block = get_arg_val<uint32_t>(3);
    const uint32_t blocks_per_core = get_arg_val<uint32_t>(4);
    const uint32_t width_size = get_arg_val<uint32_t>(5);

    uint32_t size_2d = 0;
    for (uint32_t dim3 = 0; dim3 < third_dim; dim3++) {
        uint32_t start_id = 0;
        for (uint32_t b = 0; b < blocks_per_core; b++) {
            write_block(total_num_rows, core_number, size_per_row_per_block, start_id, width_size, size_2d);
            start_id += width_size;
        }
        size_2d += total_num_rows;
    }
}