#5174: Uplifting microbenchmarks to run on BH
abhullar-tt committed Oct 24, 2024
1 parent 80d61da commit d33335d
Showing 5 changed files with 227 additions and 9 deletions.
8 changes: 4 additions & 4 deletions tests/scripts/run_moreh_microbenchmark.sh
@@ -29,12 +29,12 @@ run_profiling_test() {
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_h2d_l1 -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_d2h_l1 -k $ARCH_NAME
# pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_noc -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME # how to set r and c for this
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME # how to set r and c for this

if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
if [[ "$ARCH_NAME" != "grayskull" ]]; then
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_all_core -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
2 changes: 1 addition & 1 deletion tests/scripts/run_tests.sh
@@ -379,7 +379,7 @@ set_up_chdir() {
# The user might have multiple entries in their PYTHONPATH so we should try to find the right one
IFS=':' read -ra ENTRIES <<< "$PYTHONPATH"
for ENTRY in "${ENTRIES[@]}"; do
-if [[ $ENTRY == *"tt-metal" ]]; then
+if [[ $ENTRY == *"third-metal" ]]; then
cd $ENTRY
return
fi
22 changes: 21 additions & 1 deletion tests/scripts/test_moreh_microbenchmark.py
@@ -498,6 +498,7 @@ def test_pcie_d2h_dram(iteration, test_vector_small, test_vector_large):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
],
)
def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
@@ -522,6 +523,7 @@ def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536])),
],
)
def test_pcie_d2h_l1(arch, iteration, L1_size, test_vector):
@@ -618,6 +620,16 @@ def test_matmul_dram(arch, freq, r, c, test_vector):
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
# ########################### 512 512 512 x 8 subblock 4 2 ################################
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 0, 1, 8, 0, 0, 0, 0, 717089.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 1, 1, 8, 0, 0, 0, 0, 1233930.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 0, 1, 8, 1, 0, 0, 0, 664492.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 1, 1, 8, 1, 0, 0, 0, 1173029.0),
# ########################### 512 512 256x8 subblock 4 2 ################################
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 0, 0, 0, 0, 399068.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
],
)
def test_matmul_single_core_sharded(
@@ -684,9 +696,11 @@ def test_matmul_single_core_sharded(
[
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
("blackhole", 1000, np.array([32768, 8 * 128]), 1, 8, 0, 8, 0),
("blackhole", 1000, np.array([32768, 8 * 128]), 1, 8, 1, 8, 0),
],
)
-def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
data = []
cycle_list = []
time_list = []
@@ -775,6 +789,12 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
# single layer single receiver test
("blackhole", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
# single layer multi receiver test
("blackhole", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("blackhole", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
],
)
def test_dram_read_remote_cb_sync(
@@ -683,17 +683,21 @@ int main(int argc, char** argv) {
uint32_t get_l1_size(tt::ARCH arch) {
constexpr uint32_t GS_L1_SIZE = 1048576;
constexpr uint32_t WH_L1_SIZE = 1499136;
constexpr uint32_t BH_L1_SIZE = 1499136;

uint32_t l1_size = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
l1_size = WH_L1_SIZE;
} else if (arch == tt::ARCH::GRAYSKULL) {
l1_size = GS_L1_SIZE;
} else if (arch == tt::ARCH::BLACKHOLE) {
l1_size = BH_L1_SIZE;
}
return l1_size;
}
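For orientation, a minimal usage sketch (the clamp_to_l1 helper is hypothetical, not part of this diff): the per-arch value returned above is the same bound the PCIe L1 tests use, matching the L1_size parameters added to test_moreh_microbenchmark.py.

// Sketch only: clamp a requested transfer so it fits in a single core's L1.
// Assumes get_l1_size() from above and <algorithm> for std::min.
uint32_t clamp_to_l1(tt::ARCH arch, uint32_t requested_bytes) {
    uint32_t l1_bytes = get_l1_size(arch);  // 1499136 for both WH and BH here
    return std::min(requested_bytes, l1_bytes);
}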

double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) {
constexpr double BH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.97;
constexpr double WH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.05;
constexpr double GS_FPU_BFP8_TFLOPS_PER_TENSIX = 0.58;

@@ -706,6 +710,9 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) {
} else if (arch == tt::ARCH::GRAYSKULL) {
rpeak_tflops =
GS_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
} else if (arch == tt::ARCH::BLACKHOLE) {
rpeak_tflops =
BH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
}

log_debug(LogTest, "Rpeak {} TFLOPS", rpeak_tflops);
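As a worked illustration of the formula above (all values hypothetical -- the real grid size comes from get_core_range and the clock from the device, assumed here to be normalized to GHz):

// Illustrative arithmetic only, not from this commit: a hypothetical 13x10
// compute grid at a GHz-normalized clock of 1.0 would give
//   2.97 TFLOPS/Tensix * 130 Tensix * 1.0 = ~386.1 TFLOPS peak BFP8 on BH.
double example_rpeak_tflops = 2.97 * (13 * 10) * 1.0;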
@@ -776,7 +783,7 @@ CoreCoord get_core_range(
std::tuple<MathFidelity, bool> get_compute_params(tt::ARCH arch) {
MathFidelity math_fidelity = MathFidelity::HiFi4;
bool fp32_dest_acc_en = false;
-if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
+if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0 || arch == tt::ARCH::BLACKHOLE) {
math_fidelity = MathFidelity::HiFi2;
// TODO: apply packer_l1_acc
// TODO: need to consider whether to set these variables as arguments
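For context, a sketch of how the returned pair is typically consumed when building a compute kernel config; the ComputeConfig field names follow common tt-metal usage and are an assumption here, not something this diff shows.

// Sketch only -- assumes tt_metal::ComputeConfig exposes these two fields
// and that `device` is in scope.
auto [math_fidelity, fp32_dest_acc_en] = get_compute_params(device->arch());
tt_metal::ComputeConfig compute_config;
compute_config.math_fidelity = math_fidelity;
compute_config.fp32_dest_acc_en = fp32_dest_acc_en;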
@@ -222,12 +222,15 @@ bool validation(
uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100;
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;
constexpr uint32_t BH_DRAM_BANDWIDTH_GB_PER_SEC = 512;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::BLACKHOLE) {
dram_bandwidth_gb_per_sec = BH_DRAM_BANDWIDTH_GB_PER_SEC;
}
return dram_bandwidth_gb_per_sec;
}
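A minimal sketch of how the spec value is typically used -- comparing measured throughput against the per-arch ceiling (the dram_utilization helper is illustrative, not part of this diff):

// Sketch only: fraction of theoretical DRAM bandwidth actually achieved.
// 1 byte/ns == 1 GB/s, so bytes divided by nanoseconds yields GB/s directly.
double dram_utilization(tt::ARCH arch, uint64_t bytes_moved, double elapsed_ns) {
    double measured_gb_per_sec = static_cast<double>(bytes_moved) / elapsed_ns;
    return measured_gb_per_sec / get_dram_bandwidth(arch);
}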
@@ -512,6 +515,192 @@ void get_dram_reader_core_coords_wormhole_b0(
all_cores_ordered = adj_core_logical_realloc;
}

void get_dram_reader_core_coords_blackhole(
tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
// hardcoded for bh
uint32_t full_grid_size_x = 17;
uint32_t full_grid_size_y = 12;
uint32_t x_step = 3;

// get all the logical coord
auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
uint32_t num_cores_x = compute_with_storage_grid_size.x;
uint32_t num_cores_y = compute_with_storage_grid_size.y;

// get dram banks and coords
uint32_t num_banks = device->num_dram_channels();
uint32_t max_bank_id = num_banks - 1;
std::vector<CoreCoord> dram_coord_phy;
dram_coord_phy.reserve(num_banks);
for (int i = 0; i < num_banks; ++i) {
dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
}

// get worker logical coords
std::vector<CoreCoord> all_worker_cores_logical;
all_worker_cores_logical.reserve(num_cores_x * num_cores_y);
for (int i = 0; i < num_cores_x; ++i) {
for (int j = 0; j < num_cores_y; ++j) {
all_worker_cores_logical.push_back(CoreCoord(i, j));
}
}

// get y coords of the workers
std::vector<uint32_t> all_worker_cores_y_physical;
all_worker_cores_y_physical.reserve(num_cores_y);
uint32_t max_worker_y_physical = 0;
uint32_t min_worker_y_physical = 10000;
for (int i = 0; i < num_cores_y; ++i) {
auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i));
all_worker_cores_y_physical.push_back(core_phy.y);
if (core_phy.y > max_worker_y_physical) {
max_worker_y_physical = core_phy.y;
}
if (core_phy.y < min_worker_y_physical) {
min_worker_y_physical = core_phy.y;
}
}

// get the harvested rows; we treat dram and eth cores as harvested as well
std::vector<uint32_t> harvested_rows;
for (int i = 0; i < full_grid_size_y; ++i) {
auto y = i;

if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) ==
all_worker_cores_y_physical.end()) {
harvested_rows.push_back(y);
}
}

// get the adjacent cores of DRAM banks
std::vector<CoreCoord> adj_core_physical;
adj_core_physical.reserve(num_banks);
for (int i = 0; i < num_banks; ++i) {
auto dram_core = dram_coord_phy[i];
uint32_t adj_core_x = dram_core.x + 1;
uint32_t adj_core_y = dram_core.y;
adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
}

// split the adjacent coords into two groups, because the DRAM banks form two columns
std::vector<CoreCoord> adj_core_physical_g1;
adj_core_physical_g1.reserve(num_banks);
std::vector<size_t> adj_core_physical_y_g1;
adj_core_physical_y_g1.reserve(num_banks);
std::vector<CoreCoord> adj_core_physical_g2;
adj_core_physical_g2.reserve(num_banks);
std::vector<size_t> adj_core_physical_y_g2;
adj_core_physical_y_g2.reserve(num_banks);
for (auto core : adj_core_physical) {
if (core.x == adj_core_physical.front().x) {
adj_core_physical_g1.push_back(core);
} else {
adj_core_physical_g2.push_back(core);
}
}
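// Reorder each group: sort the index arrays by y, then rotate by one so the
// highest-y entry comes first; indices_g*_realloc (built below) stores the
// inverse permutation, so the merged list can be emitted in bank order.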
std::vector<int> indices_g1(adj_core_physical_g1.size());
std::vector<int> indices_g2(adj_core_physical_g2.size());
std::iota(indices_g1.begin(), indices_g1.end(), 0);
std::iota(indices_g2.begin(), indices_g2.end(), 0);
std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) {
return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y;
});
std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) {
return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y;
});
std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end());
std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end());

std::vector<int> indices_g1_realloc(adj_core_physical_g1.size());
std::vector<int> indices_g2_realloc(adj_core_physical_g2.size());
for (int new_index = 0; new_index < indices_g1.size(); ++new_index) {
indices_g1_realloc[indices_g1[new_index]] = new_index;
}
for (int new_index = 0; new_index < indices_g2.size(); ++new_index) {
indices_g2_realloc[indices_g2[new_index]] = new_index;
}

std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& b) {
return a.y < b.y;
});
std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) {
return a.y < b.y;
});
std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end());
std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end());

for (auto core : adj_core_physical_g1) {
adj_core_physical_y_g1.push_back(core.y);
}
for (auto core : adj_core_physical_g2) {
adj_core_physical_y_g2.push_back(core.y);
}

// move the workers, if they are on harvested rows
auto process_group = [&](std::vector<CoreCoord>& group, std::vector<size_t>& group_y, uint32_t x_step) {
for (auto& coord : group) {
auto y = coord.y;

if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() ||
std::count(group_y.begin(), group_y.end(), y) >= 2) {
auto adjust_coord = [&](int start, int end, int step) {
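// Two-pass search: first prefer a row that is neither harvested nor
// already used by this group; failing that, accept any non-harvested
// row even if another reader in the group already sits on it.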
bool found_new_row = false;
for (int j = start; step > 0 ? j <= end : j >= end; j += step) {
if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() &&
std::count(group_y.begin(), group_y.end(), j) == 0) {
coord.y = j;
coord.x += x_step;
x_step--;
found_new_row = true;
break;
}
}
if (not found_new_row) {
for (int j = start; step > 0 ? j <= end : j >= end; j += step) {
if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) {
coord.y = j;
coord.x += x_step;
x_step--;
found_new_row = true;
break;
}
}
}
};

if (y >= max_bank_id) {
adjust_coord(max_worker_y_physical, min_worker_y_physical, -1);
} else {
adjust_coord(min_worker_y_physical, max_worker_y_physical, 1);
}
}
}
};
// relocate workers in both groups off harvested rows
process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step);
process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step);

// merge the two groups into one
std::vector<CoreCoord> adj_core_physical_realloc;
adj_core_physical_realloc.reserve(num_banks);
for (int i = 0; i < indices_g1_realloc.size(); ++i) {
adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]);
}
for (int i = 0; i < indices_g2_realloc.size(); ++i) {
adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]);
}

// find the logical coord from physical coord
std::vector<CoreCoord> adj_core_logical_realloc;
adj_core_logical_realloc.reserve(num_banks);
for (int i = 0; i < adj_core_physical_realloc.size(); ++i) {
for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
if (adj_core_physical_realloc[i] == core) {
adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
}
}
}

// create sets
std::set<CoreRange> all_cores_set;
for (int i = 0; i < num_banks; ++i) {
all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
}
all_cores = CoreRangeSet(all_cores_set);
all_cores_ordered = adj_core_logical_realloc;
}

int main(int argc, char **argv) {
if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
log_error("Test not supported w/ slow dispatch, exiting");
@@ -624,7 +813,7 @@ int main(int argc, char **argv) {
tt_metal::Device *device = tt_metal::CreateDevice(device_id);
dram_bandwidth_spec = get_dram_bandwidth(device->arch());

-TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0");
+TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0 or device->arch() == ARCH::BLACKHOLE, "device must be wh_b0 or bh");

int clock_freq_mhz = get_tt_npu_clock(device);

@@ -636,8 +825,10 @@ int main(int argc, char **argv) {
std::vector<CoreCoord> all_cores_list;
if (device->arch() == tt::ARCH::WORMHOLE_B0) {
get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list);
-} else {
+} else if (device->arch() == tt::ARCH::GRAYSKULL) {
get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list);
} else if (device->arch() == tt::ARCH::BLACKHOLE) {
get_dram_reader_core_coords_blackhole(device, all_cores, all_cores_list);
}

uint32_t num_tiles_per_core = num_tiles / num_cores;
