#5174: Uplifting microbenchmarks to run on BH
abhullar-tt committed Oct 24, 2024
1 parent 80d61da commit d33335d
Showing 5 changed files with 227 additions and 9 deletions.
8 changes: 4 additions & 4 deletions tests/scripts/run_moreh_microbenchmark.sh
@@ -29,12 +29,12 @@ run_profiling_test() {
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_h2d_l1 -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_pcie_d2h_l1 -k $ARCH_NAME
# pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_noc -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_dram -k $ARCH_NAME # how to set r and c for this
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME # how to set r and c for this

if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
if [[ "$ARCH_NAME" != "grayskull" ]]; then
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
-pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_all_core -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
2 changes: 1 addition & 1 deletion tests/scripts/run_tests.sh
@@ -379,7 +379,7 @@ set_up_chdir() {
# The user might have multiple entries in their PYTHONPATH so we should try to find the right one
IFS=':' read -ra ENTRIES <<< "$PYTHONPATH"
for ENTRY in "${ENTRIES[@]}"; do
-if [[ $ENTRY == *"tt-metal" ]]; then
+if [[ $ENTRY == *"third-metal" ]]; then
cd $ENTRY
return
fi
22 changes: 21 additions & 1 deletion tests/scripts/test_moreh_microbenchmark.py
@@ -498,6 +498,7 @@ def test_pcie_d2h_dram(iteration, test_vector_small, test_vector_large):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536, 262144, 1048576, 4194304, 16777216])),
],
)
def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
@@ -522,6 +523,7 @@ def test_pcie_h2d_l1(arch, iteration, L1_size, test_vector):
[
("grayskull", 2, 1048576, np.array([4096, 16384, 65536])),
("wormhole_b0", 2, 1499136, np.array([4096, 16384, 65536])),
("blackhole", 2, 1499136, np.array([4096, 16384, 65536])),
],
)
def test_pcie_d2h_l1(arch, iteration, L1_size, test_vector):
@@ -618,6 +620,16 @@ def test_matmul_dram(arch, freq, r, c, test_vector):
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("wormhole_b0", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
# ########################### 512 512 512 x 8 subblock 4 2 ################################
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 0, 1, 8, 0, 0, 0, 0, 717089.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 1, 1, 8, 0, 0, 0, 0, 1233930.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 0, 1, 8, 1, 0, 0, 0, 664492.0),
("blackhole", 1000, np.array([[512, 512, 512]]), 0, 1, 1, 8, 1, 0, 0, 0, 1173029.0),
# ########################### 512 512 256x8 subblock 4 2 ################################
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 0, 0, 0, 0, 399068.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 0, 0, 0, 0, 658522.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 0, 1, 8, 1, 0, 0, 0, 346350.0),
("blackhole", 1000, np.array([[512, 512, 256]]), 0, 1, 1, 8, 1, 0, 0, 0, 597457.0),
],
)
def test_matmul_single_core_sharded(
@@ -684,9 +696,11 @@ def test_matmul_single_core_sharded(
[
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
("blackhole", 1000, np.array([32768, 8 * 128]), 1, 8, 0, 8, 0),
("blackhole", 1000, np.array([32768, 8 * 128]), 1, 8, 1, 8, 0),
],
)
-def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
data = []
cycle_list = []
time_list = []
@@ -775,6 +789,12 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
# single layer single receiver test
("blackhole", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
# single layer multi receiver test
("blackhole", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("blackhole", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
],
)
def test_dram_read_remote_cb_sync(
@@ -683,17 +683,21 @@ int main(int argc, char** argv) {
uint32_t get_l1_size(tt::ARCH arch) {
constexpr uint32_t GS_L1_SIZE = 1048576;
constexpr uint32_t WH_L1_SIZE = 1499136;
constexpr uint32_t BH_L1_SIZE = 1499136;

uint32_t l1_size = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
l1_size = WH_L1_SIZE;
} else if (arch == tt::ARCH::GRAYSKULL) {
l1_size = GS_L1_SIZE;
} else if (arch == tt::ARCH::BLACKHOLE) {
l1_size = BH_L1_SIZE;
}
return l1_size;
}
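For orientation, a minimal usage sketch (the clamp_to_l1 helper is hypothetical, not part of this diff): the per-arch value returned above is the same bound the PCIe L1 tests use, matching the L1_size parameters added to test_moreh_microbenchmark.py.

// Sketch only: clamp a requested transfer so it fits in a single core's L1.
// Assumes get_l1_size() from above and <algorithm> for std::min.
uint32_t clamp_to_l1(tt::ARCH arch, uint32_t requested_bytes) {
    uint32_t l1_bytes = get_l1_size(arch);  // 1499136 for both WH and BH here
    return std::min(requested_bytes, l1_bytes);
}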

double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) {
constexpr double BH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.97;
constexpr double WH_FPU_BFP8_TFLOPS_PER_TENSIX = 2.05;
constexpr double GS_FPU_BFP8_TFLOPS_PER_TENSIX = 0.58;

@@ -706,6 +710,9 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_clock) {
} else if (arch == tt::ARCH::GRAYSKULL) {
rpeak_tflops =
GS_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
} else if (arch == tt::ARCH::BLACKHOLE) {
rpeak_tflops =
BH_FPU_BFP8_TFLOPS_PER_TENSIX * static_cast<double>(num_compute_core) * static_cast<double>(clock);
}

log_debug(LogTest, "Rpeak {} TFLOPS", rpeak_tflops);
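As a worked illustration of the formula above (all values hypothetical -- the real grid size comes from get_core_range and the clock from the device, assumed here to be normalized to GHz):

// Illustrative arithmetic only, not from this commit: a hypothetical 13x10
// compute grid at a GHz-normalized clock of 1.0 would give
//   2.97 TFLOPS/Tensix * 130 Tensix * 1.0 = ~386.1 TFLOPS peak BFP8 on BH.
double example_rpeak_tflops = 2.97 * (13 * 10) * 1.0;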
@@ -776,7 +783,7 @@ CoreCoord get_core_range(
std::tuple<MathFidelity, bool> get_compute_params(tt::ARCH arch) {
MathFidelity math_fidelity = MathFidelity::HiFi4;
bool fp32_dest_acc_en = false;
-if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
+if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0 || arch == tt::ARCH::BLACKHOLE) {
math_fidelity = MathFidelity::HiFi2;
// TODO: apply packer_l1_acc
// TODO: need to consider whether to set these variables as arguments
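For context, a sketch of how the returned pair is typically consumed when building a compute kernel config; the ComputeConfig field names follow common tt-metal usage and are an assumption here, not something this diff shows.

// Sketch only -- assumes tt_metal::ComputeConfig exposes these two fields
// and that `device` is in scope.
auto [math_fidelity, fp32_dest_acc_en] = get_compute_params(device->arch());
tt_metal::ComputeConfig compute_config;
compute_config.math_fidelity = math_fidelity;
compute_config.fp32_dest_acc_en = fp32_dest_acc_en;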
@@ -222,12 +222,15 @@ bool validation(
uint32_t get_dram_bandwidth(tt::ARCH arch) {
constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100;
constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;
constexpr uint32_t BH_DRAM_BANDWIDTH_GB_PER_SEC = 512;

uint32_t dram_bandwidth_gb_per_sec = 0;
if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::GRAYSKULL) {
dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
} else if (arch == tt::ARCH::BLACKHOLE) {
dram_bandwidth_gb_per_sec = BH_DRAM_BANDWIDTH_GB_PER_SEC;
}
return dram_bandwidth_gb_per_sec;
}
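A minimal sketch of how the spec value is typically used -- comparing measured throughput against the per-arch ceiling (the dram_utilization helper is illustrative, not part of this diff):

// Sketch only: fraction of theoretical DRAM bandwidth actually achieved.
// 1 byte/ns == 1 GB/s, so bytes divided by nanoseconds yields GB/s directly.
double dram_utilization(tt::ARCH arch, uint64_t bytes_moved, double elapsed_ns) {
    double measured_gb_per_sec = static_cast<double>(bytes_moved) / elapsed_ns;
    return measured_gb_per_sec / get_dram_bandwidth(arch);
}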
@@ -512,6 +515,192 @@ void get_dram_reader_core_coords_wormhole_b0(
all_cores_ordered = adj_core_logical_realloc;
}

void get_dram_reader_core_coords_blackhole(
tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
// hardcoded for bh
uint32_t full_grid_size_x = 17;
uint32_t full_grid_size_y = 12;
uint32_t x_step = 3;

// get all the logical coord
auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
uint32_t num_cores_x = compute_with_storage_grid_size.x;
uint32_t num_cores_y = compute_with_storage_grid_size.y;

// get dram banks and coords
uint32_t num_banks = device->num_dram_channels();
uint32_t max_bank_id = num_banks - 1;
std::vector<CoreCoord> dram_coord_phy;
dram_coord_phy.reserve(num_banks);
for (int i = 0; i < num_banks; ++i) {
dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
}

// get worker logical coords
std::vector<CoreCoord> all_worker_cores_logical;
all_worker_cores_logical.reserve(num_cores_x * num_cores_y);
for (int i = 0; i < num_cores_x; ++i) {
for (int j = 0; j < num_cores_y; ++j) {
all_worker_cores_logical.push_back(CoreCoord(i, j));
}
}

// get y coords of the workers
std::vector<uint32_t> all_worker_cores_y_physical;
all_worker_cores_y_physical.reserve(num_cores_y);
uint32_t max_worker_y_physical = 0;
uint32_t min_worker_y_physical = 10000;
for (int i = 0; i < num_cores_y; ++i) {
auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i));
all_worker_cores_y_physical.push_back(core_phy.y);
if (core_phy.y > max_worker_y_physical) {
max_worker_y_physical = core_phy.y;
}
if (core_phy.y < min_worker_y_physical) {
min_worker_y_physical = core_phy.y;
}
}

// get the harvested rows; we treat dram and eth cores as harvested as well
std::vector<uint32_t> harvested_rows;
for (int i = 0; i < full_grid_size_y; ++i) {
auto y = i;

if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) ==
all_worker_cores_y_physical.end()) {
harvested_rows.push_back(y);
}
}

// get the adjacent cores of DRAM banks
std::vector<CoreCoord> adj_core_physical;
adj_core_physical.reserve(num_banks);
for (int i = 0; i < num_banks; ++i) {
auto dram_core = dram_coord_phy[i];
uint32_t adj_core_x = dram_core.x + 1;
uint32_t adj_core_y = dram_core.y;
adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
}

// split the adjacent coords into two groups, because the DRAM banks form two columns
std::vector<CoreCoord> adj_core_physical_g1;
adj_core_physical_g1.reserve(num_banks);
std::vector<size_t> adj_core_physical_y_g1;
adj_core_physical_y_g1.reserve(num_banks);
std::vector<CoreCoord> adj_core_physical_g2;
adj_core_physical_g2.reserve(num_banks);
std::vector<size_t> adj_core_physical_y_g2;
adj_core_physical_y_g2.reserve(num_banks);
for (auto core : adj_core_physical) {
if (core.x == adj_core_physical.front().x) {
adj_core_physical_g1.push_back(core);
} else {
adj_core_physical_g2.push_back(core);
}
}
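// Reorder each group: sort the index arrays by y, then rotate by one so the
// highest-y entry comes first; indices_g*_realloc (built below) stores the
// inverse permutation, so the merged list can be emitted in bank order.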
std::vector<int> indices_g1(adj_core_physical_g1.size());
std::vector<int> indices_g2(adj_core_physical_g2.size());
std::iota(indices_g1.begin(), indices_g1.end(), 0);
std::iota(indices_g2.begin(), indices_g2.end(), 0);
std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) {
return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y;
});
std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) {
return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y;
});
std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end());
std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end());

std::vector<int> indices_g1_realloc(adj_core_physical_g1.size());
std::vector<int> indices_g2_realloc(adj_core_physical_g2.size());
for (int new_index = 0; new_index < indices_g1.size(); ++new_index) {
indices_g1_realloc[indices_g1[new_index]] = new_index;
}
for (int new_index = 0; new_index < indices_g2.size(); ++new_index) {
indices_g2_realloc[indices_g2[new_index]] = new_index;
}

std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& b) {
return a.y < b.y;
});
std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) {
return a.y < b.y;
});
std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end());
std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end());

for (auto core : adj_core_physical_g1) {
adj_core_physical_y_g1.push_back(core.y);
}
for (auto core : adj_core_physical_g2) {
adj_core_physical_y_g2.push_back(core.y);
}

// move the workers, if they are on harvested rows
auto process_group = [&](std::vector<CoreCoord>& group, std::vector<size_t>& group_y, uint32_t x_step) {
for (auto& coord : group) {
auto y = coord.y;

if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() ||
std::count(group_y.begin(), group_y.end(), y) >= 2) {
auto adjust_coord = [&](int start, int end, int step) {
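// Two-pass search: first prefer a row that is neither harvested nor
// already used by this group; failing that, accept any non-harvested
// row even if another reader in the group already sits on it.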
bool found_new_row = false;
for (int j = start; step > 0 ? j <= end : j >= end; j += step) {
if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() &&
std::count(group_y.begin(), group_y.end(), j) == 0) {
coord.y = j;
coord.x += x_step;
x_step--;
found_new_row = true;
break;
}
}
if (not found_new_row) {
for (int j = start; step > 0 ? j <= end : j >= end; j += step) {
if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) {
coord.y = j;
coord.x += x_step;
x_step--;
found_new_row = true;
break;
}
}
}
};

if (y >= max_bank_id) {
adjust_coord(max_worker_y_physical, min_worker_y_physical, -1);
} else {
adjust_coord(min_worker_y_physical, max_worker_y_physical, 1);
}
}
}
};
// relocate workers in both groups off harvested rows
process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step);
process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step);

// merge the two groups into one
std::vector<CoreCoord> adj_core_physical_realloc;
adj_core_physical_realloc.reserve(num_banks);
for (int i = 0; i < indices_g1_realloc.size(); ++i) {
adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]);
}
for (int i = 0; i < indices_g2_realloc.size(); ++i) {
adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]);
}

// find the logical coord from physical coord
std::vector<CoreCoord> adj_core_logical_realloc;
adj_core_logical_realloc.reserve(num_banks);
for (int i = 0; i < adj_core_physical_realloc.size(); ++i) {
for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
if (adj_core_physical_realloc[i] == core) {
adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
}
}
}

// create sets
std::set<CoreRange> all_cores_set;
for (int i = 0; i < num_banks; ++i) {
all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
}
all_cores = CoreRangeSet(all_cores_set);
all_cores_ordered = adj_core_logical_realloc;
}

int main(int argc, char **argv) {
if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
log_error("Test not supported w/ slow dispatch, exiting");
@@ -624,7 +813,7 @@ int main(int argc, char **argv) {
tt_metal::Device *device = tt_metal::CreateDevice(device_id);
dram_bandwidth_spec = get_dram_bandwidth(device->arch());

-TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0");
+TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0 or device->arch() == ARCH::BLACKHOLE, "device must be wh_b0 or bh");

int clock_freq_mhz = get_tt_npu_clock(device);

@@ -636,8 +825,10 @@ int main(int argc, char **argv) {
std::vector<CoreCoord> all_cores_list;
if (device->arch() == tt::ARCH::WORMHOLE_B0) {
get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list);
-} else {
+} else if (device->arch() == tt::ARCH::GRAYSKULL) {
get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list);
} else if (device->arch() == tt::ARCH::BLACKHOLE) {
get_dram_reader_core_coords_blackhole(device, all_cores, all_cores_list);
}

uint32_t num_tiles_per_core = num_tiles / num_cores;
