From 7a6fa977298184caa476017de65b3e730c8b2ab5 Mon Sep 17 00:00:00 2001
From: dsudhakar <dsudhakar@tenstorrent.com>
Date: Wed, 12 Jun 2024 13:00:01 +0000
Subject: [PATCH 01/29] Fix core dumped and constraint issue

(cherry picked from commit 0cd7afda10dd32716637dccc58e679e4238491f6)
---
 pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py     | 2 +-
 pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py
index 0c06fa39..a443c81a 100644
--- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py
+++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py
@@ -146,7 +146,7 @@ def test_yolox_onnx(variant, test_device):
             os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1"
 
             if variant == "yolox_l":
-
+                os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "0"
                 compiler_cfg.place_on_new_epoch("conv2d_372.dc.matmul.11")
                 compiler_cfg.balancer_op_override("concatenate_433.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1))
                 compiler_cfg.balancer_op_override(
diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
index a4b00270..e11c1966 100644
--- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
+++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
@@ -156,6 +156,7 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size
             compiler_cfg.enable_auto_transposing_placement = True
             compiler_cfg.enable_tm_cpu_fallback = True
             compiler_cfg.balancer_op_override("conv2d_328.dc.matmul.8", "grid_shape", (5,2))
+            os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "0"
         if size == "x":
             compiler_cfg.balancer_op_override("concatenate_363.dc.concatenate.0", "grid_shape", (1,1))
             compiler_cfg.balancer_op_override("conv2d_41.dc.matmul.8", "t_stream_shape", (1,1))

From e4e3b4007fdebdb0650e350ffab3e7695c2368af Mon Sep 17 00:00:00 2001
From: Darko Golubovic <dgolubovic@tenstorrent.com>
Date: Wed, 12 Jun 2024 14:56:16 +0000
Subject: [PATCH 02/29] Try padding with queue as fallback to padding with nop

(cherry picked from commit 890b40a17ed4785a0e2cbbc7815d9be135e5fca8)
---
 pybuda/csrc/passes/padding_pass_placer.cpp | 336 ++++++++++++---------
 pybuda/csrc/passes/padding_pass_placer.hpp |  15 +-
 2 files changed, 202 insertions(+), 149 deletions(-)

diff --git a/pybuda/csrc/passes/padding_pass_placer.cpp b/pybuda/csrc/passes/padding_pass_placer.cpp
index 073576ca..2f711b20 100644
--- a/pybuda/csrc/passes/padding_pass_placer.cpp
+++ b/pybuda/csrc/passes/padding_pass_placer.cpp
@@ -88,6 +88,8 @@ bool insert_queue_instead_of_nop(
     return true;
 }
 
+// Adds queue on output edge of the node and checks if node is legal. Returns true if node is legal after adding queue.
+// Before returning, we remove the queue from the graph.
 bool check_if_queue_fixes_failures(
     Graph *graph,
     Node *node,
@@ -135,28 +137,177 @@ bool node_has_padding_queue(
     return false;
 }
 
+bool run_padding_loop(
+    Graph *graph,
+    Node *node,
+    Padding &padding,
+    const BudaOpNodeLegalizerFailureInfo &failure_info,
+    const BalancerConfig &balancer_config,
+    std::shared_ptr<balancer::BalancerCacheCollection> balancer_cache_collection,
+    bool queue_fixes_failures)
+{
+    const int PADDING_TRY_MAX = 10;
+    bool padded_node = false;
+    bool padded_loop = false;
+    bool no_failures = false;
+
+    std::uint32_t operand_access_cnt =
+        failure_info.getOpModelFailureCountByType(OpModelFailureReason::OperandAccessPreventsStreaming);
+    std::uint32_t operand_and_user_access_cnt =
+        failure_info.getOpModelFailureCountByType(OpModelFailureReason::OperandAndUserAccessPreventsStreaming);
+    std::uint32_t user_access_cnt =
+        failure_info.getOpModelFailureCountByType(OpModelFailureReason::UserAccessPreventsStreaming);
+    std::uint32_t buffer_alloc_cnt =
+        failure_info.getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure);
+
+    // allow padding loop at least once
+    bool first = true;
+    int padding_try_it = 0;
+    while (padding_try_it++ < PADDING_TRY_MAX && (buffer_alloc_cnt > 0 || first))
+    {
+        if (!first)
+        {
+            // If we have tried to pad the node (not first iteration of loop), but it failed, we remove padding and try
+            // again. Note, padding structure stays intact, and we resume padding in next iteration from where we have
+            // stopped.
+            remove_padding(graph, node, padding);
+        }
+        first = false;
+
+        if (operand_access_cnt)
+        {
+            // add nop on input edge of padded node to fix operand access prevents streaming failures
+            padding.add_nop_on_input_edge = true;
+        }
+        padded_loop = pad_node(graph, node, padding);
+
+        if (padded_loop)
+        {
+            std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
+                check_node_legality(graph, node, balancer_config, balancer_cache_collection);
+            user_access_cnt =
+                (failures.size() > 0)
+                    ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::UserAccessPreventsStreaming)
+                    : 0;
+            buffer_alloc_cnt =
+                (failures.size() > 0)
+                    ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure)
+                    : 0;
+            operand_access_cnt =
+                (failures.size() > 0)
+                    ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::OperandAccessPreventsStreaming)
+                    : 0;
+            operand_and_user_access_cnt = (failures.size() > 0)
+                                              ? failures[node].getOpModelFailureCountByType(
+                                                    OpModelFailureReason::OperandAndUserAccessPreventsStreaming)
+                                              : 0;
+            if (failures.size() > 0)
+            {
+                if (padded_loop)
+                    padded_loop = false;
+                log_debug(
+                    LogPadding,
+                    "Node {} is illegal after padding: lhs_rt {} lhs_ct {} rhs_ct {}",
+                    node->name(),
+                    padding.pad_lhs_rt,
+                    padding.pad_lhs_ct,
+                    padding.pad_rhs_ct);
+            }
+            else
+            {
+                no_failures = true;
+                log_debug(
+                    LogPadding,
+                    "Node {} is legal after padding: lhs_rt {} lhs_ct: {} rhs_ct: {}",
+                    node->name(),
+                    padding.pad_lhs_rt,
+                    padding.pad_lhs_ct,
+                    padding.pad_rhs_ct);
+                // If we added queue and also padded the node, we want to check if only adding the queue had solved
+                // the failures (queue_fixes_failures). If it did, we remove padding and keep the queue.
+                bool has_padding_queue = node_has_padding_queue(graph, node);
+                if (has_padding_queue && queue_fixes_failures)
+                {
+                    log_debug(
+                        LogPadding,
+                        "Node {} has padding queue on output edge but it's also padded. In this case only queue is "
+                        "enough.",
+                        node->name());
+                    remove_padding(graph, node, padding);
+                    insert_queue(graph, node);
+                    std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
+                        check_node_legality(graph, node, balancer_config, balancer_cache_collection);
+                    TT_ASSERT(failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
+                }
+                padded_node |= padded_loop;
+                break;
+            }
+        }
+    }
+
+    if (!no_failures)
+    {
+        // if we have failures after padding loop, we still have a few things that can make the node legal.
+        // If queue_fixes_failures is true, this means that adding queue without padding the node fixes the failures.
+        if (queue_fixes_failures)
+        {
+            remove_padding(graph, node, padding);
+            insert_queue(graph, node);
+            std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
+                check_node_legality(graph, node, balancer_config, balancer_cache_collection);
+            TT_ASSERT(failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
+        }
+        else
+        {
+            // If we have failures, but buffer_alloc_cnt == 0, then we can try to handle other failures by adding queue
+            // on output edge. In some cases this, along with padding, will make node legal. However, if we don't even
+            // have buffer_alloc_cnt == 0, we remove padding and give up.
+            if (buffer_alloc_cnt > 0)
+            {
+                // After padding loop we still have input buffer allocation count issues.
+                log_warning(
+                    LogPadding,
+                    "Couldn't find padding for node: {} after {} iterations.",
+                    node->name(),
+                    padding_try_it);
+                // We remove padding only if it didn't solve input buffer allocation issues.
+                remove_padding(graph, node, padding);
+                // unsuccessful padding for node
+                return false;
+            }
+
+            if (user_access_cnt > 0 || operand_and_user_access_cnt > 0)
+            {
+                // Inserting queue helps with user access failures but can also solve input buffer allocation issues.
+                insert_queue(graph, node);
+                padded_node = true;
+            }
+        }
+    }
+    return padded_node;
+}
+
+// Tries padding all nodes from map nodes_to_pad. Returns true if it successfully padded at least one node.
 bool pad_pass_placer(
     Graph *graph,
-    const std::unordered_map<graphlib::Node *, 
-    const BudaOpNodeLegalizerFailureInfo> &nodes_to_pad,
+    const std::unordered_map<graphlib::Node *, const BudaOpNodeLegalizerFailureInfo> &nodes_to_pad,
     const balancer::BalancerConfig &balancer_config,
     std::shared_ptr<balancer::BalancerCacheCollection> balancer_cache_collection)
 {
-    const int PADDING_TRY_MAX = 10;
     bool padded = false;
 
-    // We pass operations we want to pad, in other words if paddings map is not empty,
-    // for each operations in our graph we check if it should be padded or not.
-    // So, it should exist in the map and its flag should be TRUE, otherwise we skip the node.
-
+    // We go through map of nodes to pad. These can be nodes without valid op models, and/or user defined nodes that
+    // need to be padded.
     for (const auto &node_fail_pair : nodes_to_pad)
     {
-
-        Node* node = node_fail_pair.first;
+        Node *node = node_fail_pair.first;
         const BudaOpNodeLegalizerFailureInfo failure_info = node_fail_pair.second;
         log_debug(LogPadding, "Padding node {} with {}", node->name(), failure_info.toString().c_str());
 
-        // If the node has no valid grids and has padding_nop tag, we replace it with buffering queue.
+        // If the node has no valid grids and has padding_nop tag it means that we padded nop's producer and got valid
+        // op models, but nop that we added on output edge took the burden of constraints and we just postponed the no
+        // valid grids issue to the next node (padding_nop). This is solved by replacing padding nop with the buffering
+        // queue.
         if (node->as<graphlib::TaggedNode>()->has_tag("padding_nop"))
         {
             std::vector<Node *> oprands = graph->data_operands(node);
@@ -170,127 +321,37 @@ bool pad_pass_placer(
 
         if (node->as<graphlib::TaggedNode>()->has_tag("padding"))
             continue;
-        std::uint32_t operand_access_cnt =
-            failure_info.getOpModelFailureCountByType(OpModelFailureReason::OperandAccessPreventsStreaming);
-        std::uint32_t operand_and_user_access_cnt =
-            failure_info.getOpModelFailureCountByType(OpModelFailureReason::OperandAndUserAccessPreventsStreaming);
-        std::uint32_t operand_and_user_access_cnt_begin = operand_and_user_access_cnt;
-
-        std::uint32_t user_access_cnt =
-            failure_info.getOpModelFailureCountByType(OpModelFailureReason::UserAccessPreventsStreaming);
-        std::uint32_t user_access_cnt_begin = user_access_cnt;
-
-        std::uint32_t buffer_alloc_cnt =
-            failure_info.getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure);
-
-        int padding_try_it = 0;
-        bool padded_loop = false;
 
+        // Initialize padding structure.
         Padding padding;
-        // Preserve the original shape
-        padding.orig_shape = node->shape();
-        bool no_failures = false;
-        bool add_nop_on_input_edge = false;
+        // Preserve the original shape.
+        tt::graphlib::Shape orig_shape = node->shape();
+        padding.orig_shape = orig_shape;
 
-        // Check if adding queue after the node fixes failures. After padding loop is done, we will decide whether it is
-        // better to pad the node or just to add queue.
+        // Check if adding queue after the node fixes all the failures. After padding loop is done, we will decide
+        // whether it is better to pad the node or just to add queue.
         bool queue_fixes_failures =
             check_if_queue_fixes_failures(graph, node, balancer_config, balancer_cache_collection);
         log_trace(LogPadding, "For node {}, queue after node fixes failures: {}", node->name(), queue_fixes_failures);
-        // allow padding loop at least once
-        bool first = true;
-        while (padding_try_it++ < PADDING_TRY_MAX && ( buffer_alloc_cnt > 0 || first ))
-        {
-            first = false;
-            if (padding_try_it > 0)
-            {
-                // If we have tried to pad the node, but it failed, we remove padding and try again.
-                // Note, padding structure stays intact, and we resume padding in next iteration from where we have stopped.
-                remove_padding(graph, node, padding);
-            }
-
-            if (operand_access_cnt)
-            {
-                // add nop on input edge of padded node to fix operand access prevents streaming failures
-                add_nop_on_input_edge = true;
-            }
-            padded_loop = pad_node(graph, node, padding, add_nop_on_input_edge);
-            
-            if (padded_loop) 
-            {
-
-                std::unordered_map<Node*, const BudaOpNodeLegalizerFailureInfo> failures = check_node_legality(graph, node, balancer_config, balancer_cache_collection);
-                // user_access_cnt shouldn't be updated in padding loop because it is only used after the loop if there are still failures and graph is unpadded.
-                user_access_cnt = (failures.size() > 0) ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::UserAccessPreventsStreaming) : 0;
-                buffer_alloc_cnt = (failures.size() > 0) ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure) : 0;
-                operand_access_cnt = (failures.size() > 0) ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::OperandAccessPreventsStreaming): 0;
-                operand_and_user_access_cnt = (failures.size() > 0) ? failures[node].getOpModelFailureCountByType(OpModelFailureReason::OperandAndUserAccessPreventsStreaming): 0;
-                if (failures.size() > 0)
-                {
-                    if (padded_loop)
-                        padded_loop = false;
-                    log_debug(LogPadding, "Node {} is illegal after padding: lhs_rt {} lhs_ct {} rhs_ct {}", node->name(), padding.pad_lhs_rt, padding.pad_lhs_ct, padding.pad_rhs_ct);
-                }
-                else
-                {
-                    no_failures = true;
-                    log_debug(LogPadding, "Node {} is legal after padding: lhs_rt {} lhs_ct: {} rhs_ct: {}", node->name(), padding.pad_lhs_rt, padding.pad_lhs_ct, padding.pad_rhs_ct);
-                    // If we added queue and also padded the node, we want to check if only adding the queue had solved
-                    // the failures (queue_fixes_failures). If it did, we remove padding and keep the queue.
-                    bool has_padding_queue = node_has_padding_queue(graph, node);
-                    if (has_padding_queue && queue_fixes_failures)
-                    {
-                        remove_padding(graph, node, padding);
-                        insert_queue(graph, node);
-                        std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
-                            check_node_legality(graph, node, balancer_config, balancer_cache_collection);
-                        TT_ASSERT(
-                            failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
-                    }
-                    padded |= padded_loop;
-                    break;
-                }
-            }
-
-        }
 
+        // Try padding the node with nop on output edges.
+        bool padded_node = run_padding_loop(
+            graph, node, padding, failure_info, balancer_config, balancer_cache_collection, queue_fixes_failures);
 
-        if (!no_failures)
+        if (!padded_node)
         {
-            // if we have failures after padding loop, we still have few things that can make the node legal.
-            // If queue_fixes_failures is true, this means that adding queue without padding the node fixes the failures.
-            if (queue_fixes_failures)
-            {
-                remove_padding(graph, node, padding);
-                insert_queue(graph, node);
-                std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
-                    check_node_legality(graph, node, balancer_config, balancer_cache_collection);
-                TT_ASSERT(
-                    failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
-            }
-            else
-            {
-                // If queue doesn't fix the failures, we try to add 
-                std::uint32_t user_access_cnt_final = (buffer_alloc_cnt > 0) ? user_access_cnt_begin : user_access_cnt;
-                std::uint32_t operand_and_user_access_cnt_final = (buffer_alloc_cnt > 0) ? operand_and_user_access_cnt_begin : operand_and_user_access_cnt;
-
-                if (buffer_alloc_cnt > 0)
-                {
-                    // After padding loop we still have input buffer allocation count issues.
-                    log_warning(LogPadding, "Couldn't find padding for node: {} after {} iterations.", node->name(), padding_try_it);
-                    // We remove padding only if it didn't solve input buffer allocation issues.
-                    remove_padding(graph, node, padding);
-                }
-
-                if (user_access_cnt_final > 0 || operand_and_user_access_cnt_final > 0)
-                {
-                    // Inserting queue helps with user access failures but can also solve input buffer allocation issues.
-                    insert_queue(graph, node);
-                    padded = true;
-                }
-            }
+            // Reset the padding structure and try again with padding.add_queues_on_output = true.
+            // This means that for the current node we will try padding again but with adding queue on output edge
+            // instead of nop.
+            padding = Padding();
+            padding.orig_shape = orig_shape;
+            padding.add_queues_on_output = true;
+            padded_node = run_padding_loop(
+                graph, node, padding, failure_info, balancer_config, balancer_cache_collection, queue_fixes_failures);
         }
 
+        // padded will be true if we padded successfully at least one node from the nodes_to_pad.
+        padded |= padded_node;
     }
 
     return padded;
@@ -566,8 +627,7 @@ void remove_buda_unpad(Graph *graph, Node *node)
 bool pad_node(
     Graph *graph, 
     Node *node,
-    Padding &padding,
-    bool add_nop_on_input_edge
+    Padding &padding
 )
 {
 
@@ -604,7 +664,7 @@ bool pad_node(
         if (element_wise_flag && op_type != "splice")
         {
             compute_pad_eltwise(node, padding, criterion);
-            return pad_eltwise(graph, node, padding, add_nop_on_input_edge);
+            return pad_eltwise(graph, node, padding);
         }
 
         /* TODO: Should be enabled.
@@ -620,14 +680,14 @@ bool pad_node(
         if (buda_op_node->is_sparse_matmul() && sparse_matmul_flag)
         {
             compute_pad_smm(graph, node, padding, criterion);
-            return pad_smm(graph, node, padding, add_nop_on_input_edge);
+            return pad_smm(graph, node, padding);
         }
 
         // Pad matmul
         if (buda_op_node->is_matmul() && matmul_flag)
         {
             compute_pad_matmul(graph, node, padding, criterion);
-            return pad_matmul(graph, node, padding, add_nop_on_input_edge);
+            return pad_matmul(graph, node, padding);
         }
 
     }  // end if, matmul
@@ -654,8 +714,7 @@ void set_padded_node_out_shape(Node* padded_node, Padding &padding)
 bool pad_eltwise(
     Graph *graph, 
     Node *node,
-    Padding &padding,
-    bool add_nop_on_input_edge
+    Padding &padding
 )
 {
 
@@ -692,7 +751,7 @@ bool pad_eltwise(
                 // Padding value, used only in case
                 // when we use buda implmentation for padding
                 0.0,
-                add_nop_on_input_edge
+                padding.add_nop_on_input_edge
             );
 
     }  // end for, incoming edges
@@ -720,8 +779,7 @@ bool pad_eltwise(
 bool pad_matmul(
     Graph *graph, 
     Node *node,
-    Padding &padding,
-    bool add_nop_on_input_edge
+    Padding &padding
 )
 {
 
@@ -760,7 +818,7 @@ bool pad_matmul(
             // Padding value, used only in case
             // when we use buda implmentation for padding
             0.0,
-            add_nop_on_input_edge);
+            padding.add_nop_on_input_edge);
     }
 
     // Insert pad for the right operand
@@ -775,7 +833,7 @@ bool pad_matmul(
             // Padding value, used only in case
             // when we use buda implmentation for padding
             0.0,
-            add_nop_on_input_edge);
+            padding.add_nop_on_input_edge);
     }
 
     // If matmul has bias operand (incoming_edges.size() > 2), we update tms on the bias edge to include padding.
@@ -803,12 +861,7 @@ bool pad_matmul(
     return true;
 }
 
-bool pad_smm(
-    Graph *graph, 
-    Node *node, 
-    Padding &padding,
-    bool add_nop_on_input_edge
-)
+bool pad_smm(Graph *graph, Node *node, Padding &padding)
 {
     bool padded = false;
 
@@ -871,7 +924,7 @@ bool pad_smm(
                     // Padding value, used only in case
                     // when we use buda implmentation for padding
                     0.0,
-                    add_nop_on_input_edge
+                    padding.add_nop_on_input_edge
                 );
 
                 padded = true;
@@ -1222,7 +1275,7 @@ void insert_unpad(
             (PortId)edge.consumer_input_port_id);
         std::get<1>(new_queue_info)->as<TaggedNode>()->add_tags({ { "padding_queue", true } });
     }
-    else if (padding.added_nop.find(edge.consumer_node_id) != padding.added_nop.end())
+    else if (padding.add_queues_on_output)
     {
         insert_unpad_buda(
             graph,
@@ -1271,9 +1324,6 @@ void insert_unpad(
             // Original shape C dimension
             orig_c
         );
-
-        // Set the NOP indicator.
-        padding.added_nop.insert(outgoing_node->id());
     }
 
 }
diff --git a/pybuda/csrc/passes/padding_pass_placer.hpp b/pybuda/csrc/passes/padding_pass_placer.hpp
index 2bc448f1..92874a2d 100644
--- a/pybuda/csrc/passes/padding_pass_placer.hpp
+++ b/pybuda/csrc/passes/padding_pass_placer.hpp
@@ -43,8 +43,11 @@ struct Padding
     std::uint32_t pad_lhs_ct = 0;
     std::uint32_t pad_rhs_ct = 0;
 
-    // If we have added nop input edge of the node with NodeId map will hold true for that NodeId.
-    std::unordered_set<tt::graphlib::NodeId> added_nop;
+    // If true, we will add nop on input edge of padded node. This is done to solve operand constraints.
+    bool add_nop_on_input_edge = false;
+
+    // If true, we will add buffering queue on output edge of padded node instead of adding nop.
+    bool add_queues_on_output = false;
 };
 
 // Padding criterion says how we want to compute
@@ -115,13 +118,13 @@ void remove_pad_tm(Graph *, Node *);
 
 void remove_buda_unpad(tt::graphlib::Graph *, tt::graphlib::Node *);
 
-bool pad_node(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, bool add_nop_on_input_edge = false);
+bool pad_node(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &);
 
-bool pad_eltwise(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, bool add_nop_on_input_edge = false);
+bool pad_eltwise(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &);
 
-bool pad_matmul(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, bool add_nop_on_input_edge = false);
+bool pad_matmul(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &);
 
-bool pad_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, bool add_nop_on_input_edge = false);
+bool pad_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &);
 
 // TODO: In Progress.
 // bool pad_splice(tt::graphlib::Graph *, tt::graphlib::Node *);

From 0cb7c5d850ec24646cb4d5fe8b2dbf0f44a9862a Mon Sep 17 00:00:00 2001
From: Vladimir Brkic <vbrkic@tenstorrent.com>
Date: Thu, 13 Jun 2024 07:14:14 +0000
Subject: [PATCH 03/29] Shape definitions and calculation of operand inputs in
 RGG

(cherry picked from commit 74c69ce3de4da34c868f0fbac6857e7b9ddc4282)
---
 pybuda/pybuda/op_repo/datatypes.py            |   8 +-
 pybuda/pybuda/op_repo/pybuda_operators.py     |  17 ++-
 pybuda/pybuda/op_repo/pytorch_operators.py    |  25 ++-
 pybuda/pybuda/op_repo/shapes.py               |  59 +++++++
 pybuda/test/README.debug.md                   |   4 +-
 pybuda/test/random/rgg/__init__.py            |   4 +-
 pybuda/test/random/rgg/algorithms.py          | 144 +++++++++++++-----
 pybuda/test/random/rgg/base.py                |  28 +---
 pybuda/test/random/rgg/config.py              |   9 +-
 pybuda/test/random/rgg/datatypes.py           |  15 +-
 .../random/rgg/pybuda/generated_model.jinja2  |  14 +-
 .../random/rgg/pytorch/generated_model.jinja2 |  14 +-
 pybuda/test/random/rgg/utils.py               |  39 ++---
 pybuda/test/random/test_three_ops.py          |   4 +-
 14 files changed, 273 insertions(+), 111 deletions(-)
 create mode 100644 pybuda/pybuda/op_repo/shapes.py

diff --git a/pybuda/pybuda/op_repo/datatypes.py b/pybuda/pybuda/op_repo/datatypes.py
index 9a5ce740..9f7fd184 100644
--- a/pybuda/pybuda/op_repo/datatypes.py
+++ b/pybuda/pybuda/op_repo/datatypes.py
@@ -4,10 +4,15 @@
 # Operator repository models
 
 
-from typing import List, Optional, Callable, Type, Union
+from random import Random
+from typing import List, Tuple, Optional, Callable, Type, Union
 from dataclasses import dataclass, field
 
 
+# Defining a type for tensor shape
+TensorShape = Tuple[int, ...]
+
+
 @dataclass
 class OperatorParamNumber:
     name: str
@@ -29,6 +34,7 @@ class OperatorDefinition:
     forward_code: Optional[Callable[[], str]] = None
     forward_params: List[OperatorParam] = field(default_factory=list)
     operands: List[str] = field(default_factory=list)  # TODO describe operand and shapes
+    calc_input_shapes: Optional[Callable[["OperatorDefinition", TensorShape, Random], List[TensorShape]]] = None  # calculate input shapes from output shape
 
     def is_operator(self) -> bool:
         return not self.instantiate
diff --git a/pybuda/pybuda/op_repo/pybuda_operators.py b/pybuda/pybuda/op_repo/pybuda_operators.py
index b342bd82..2da088f0 100644
--- a/pybuda/pybuda/op_repo/pybuda_operators.py
+++ b/pybuda/pybuda/op_repo/pybuda_operators.py
@@ -6,23 +6,24 @@
 
 from .datatypes import OperatorDefinition, OperatorRepository
 from .datatypes import OperatorParamNumber
+from .shapes import same_input_shapes
+from .shapes import matmul_inputs
 
 
 # TODO describe operand and shapes
 _OPERATORS = [
-    OperatorDefinition("relu", "pybuda.op.Relu", 1),
-    OperatorDefinition("sqrt", "pybuda.op.Sqrt", 1),
-    OperatorDefinition("tanh", "pybuda.op.Tanh", 1),
-    OperatorDefinition("exp", "pybuda.op.Exp", 1),
+    OperatorDefinition("relu", "pybuda.op.Relu", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("tanh", "pybuda.op.Tanh", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("exp", "pybuda.op.Exp", 1, calc_input_shapes=same_input_shapes),
     OperatorDefinition("pow", "pybuda.op.Pow", 1, forward_params=[
         # float exponent is currently not supported due to issue #2592
         # OperatorParamNumber("exponent", float, 0, 100),
         OperatorParamNumber("exponent", int, 0, 100),
-    ]),
-    OperatorDefinition("add", "pybuda.op.Add", 2),
+    ], calc_input_shapes=same_input_shapes),
+    OperatorDefinition("add", "pybuda.op.Add", 2, calc_input_shapes=same_input_shapes),
 
-    OperatorDefinition("matmul", "pybuda.op.Matmul", 2),
-    OperatorDefinition("eltwise", "pybuda.op.Add", 2),
+    OperatorDefinition("matmul", "pybuda.op.Matmul", 2, calc_input_shapes=matmul_inputs),
+    OperatorDefinition("eltwise", "pybuda.op.Add", 2, calc_input_shapes=same_input_shapes),
 ]
 
 
diff --git a/pybuda/pybuda/op_repo/pytorch_operators.py b/pybuda/pybuda/op_repo/pytorch_operators.py
index c3c23b5f..ef78c878 100644
--- a/pybuda/pybuda/op_repo/pytorch_operators.py
+++ b/pybuda/pybuda/op_repo/pytorch_operators.py
@@ -6,21 +6,30 @@
 
 from .datatypes import OperatorDefinition, OperatorRepository
 from .datatypes import OperatorParamNumber
+from .shapes import same_input_shapes
+from .shapes import linear_inputs
+from .shapes import conv2d_inputs
+from .shapes import matmul_inputs
 
 
 # TODO describe operand and shapes
 _OPERATORS = [
-    OperatorDefinition("linear", "torch.nn.Linear", 1, instantiate=True),
+    OperatorDefinition("linear", "torch.nn.Linear", 1, instantiate=True, constructor_params=[
+        OperatorParamNumber("in_features", int, 10, 50),
+        OperatorParamNumber("out_features", int, 10, 50),
+    ], calc_input_shapes=linear_inputs),
     OperatorDefinition("conv2d", "torch.nn.Conv2d", 1, instantiate=True, constructor_params=[
+        OperatorParamNumber("in_channels", int, 10, 50),
+        OperatorParamNumber("out_channels", int, 10, 50),
         OperatorParamNumber("kernel_size", int, 3, 3),
         OperatorParamNumber("stride", int, 1, 1),
         OperatorParamNumber("padding", int, 1, 1),
-    ]),
-    OperatorDefinition("relu", "torch.relu", 1),
-    OperatorDefinition("sqrt", "torch.sqrt", 1),
-    OperatorDefinition("tanh", "torch.tanh", 1),
+    ], calc_input_shapes=conv2d_inputs),
+    OperatorDefinition("relu", "torch.relu", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("sqrt", "torch.sqrt", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("tanh", "torch.tanh", 1, calc_input_shapes=same_input_shapes),
     # OperatorDefinition("add", "torch.add", 1),
-    OperatorDefinition("add", "torch.add", 2),
+    OperatorDefinition("add", "torch.add", 2, calc_input_shapes=same_input_shapes),
 
     # Non-linear activation functions
     # HARDTANH = OperatorDefinition("hardtanh", 1)
@@ -56,8 +65,8 @@
     # LOCAL_RESPONSE_NORM = OperatorDefinition("local_response_norm", 1)
     # NORMALIZE = OperatorDefinition("normalize", 1)
 
-    OperatorDefinition("matmul", "torch.matmul", 2),
-    OperatorDefinition("eltwise", "torch.add", 2),
+    OperatorDefinition("matmul", "torch.matmul", 2, calc_input_shapes=matmul_inputs),
+    OperatorDefinition("eltwise", "torch.add", 2, calc_input_shapes=same_input_shapes),
 ]
 
 
diff --git a/pybuda/pybuda/op_repo/shapes.py b/pybuda/pybuda/op_repo/shapes.py
new file mode 100644
index 00000000..814a1d36
--- /dev/null
+++ b/pybuda/pybuda/op_repo/shapes.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+# Calculation of input shapes from output shapes for the specified operator
+
+
+from random import Random
+from typing import List
+
+from .datatypes import OperatorDefinition
+from .datatypes import TensorShape
+
+
+def same_input_shapes(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]:
+    # every input operand takes the output shape unchanged; rng_shape is not consulted
+    return [output_shape] * operator_definition.input_num
+
+
+def linear_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]:
+    # a linear layer only changes the last dimension, so the single input keeps
+    # every leading dimension of the output and gets a randomized last dimension
+    leading_dims = output_shape[:-1]
+    out_features = output_shape[-1]
+    in_features = randomize_size(out_features, rng_shape)
+    return [leading_dims + (in_features,)]
+
+
+# FIXME: conv2d in PyTorch not working properly in all cases
+def conv2d_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]:
+    # only the dimension at index 1 is randomized; all other dimensions are copied from the output
+    head = output_shape[:1]
+    tail = output_shape[2:]
+    out_channels = output_shape[1]
+    in_channels = randomize_size(out_channels, rng_shape)
+    return [head + (in_channels,) + tail]
+
+
+def matmul_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]:
+    # an output of shape (..., m, n) is built from operands (..., m, q) and (..., q, n)
+    leading_dims = output_shape[:-2]
+    m, n = output_shape[-2], output_shape[-1]
+    # the shared inner dimension q is randomized around n
+    q = randomize_size(n, rng_shape)
+    lhs_shape, rhs_shape = leading_dims + (m, q), leading_dims + (q, n)
+    return [lhs_shape, rhs_shape]
+
+
+def randomize_size(n: int, rng_shape: Random) -> int:
+    '''Randomize the size of a dimension based on the size of another dimension.
+    Returns n plus or minus a random offset in [1, n // 2], i.e. a value in [n/2, 3n/2] other than n itself.
+    For n < 2 no offset keeps the size positive, so n is returned unchanged instead of raising in randint.
+
+    Args:
+        n: size of a dimension
+        rng_shape: random number generator
+
+    Returns: int: random size of a dimension
+    '''
+    return (n + (rng_shape.randint(0, 1) * 2 - 1) * rng_shape.randint(1, n // 2)) if n >= 2 else n
diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md
index 2ca39ee5..f2b4b33d 100644
--- a/pybuda/test/README.debug.md
+++ b/pybuda/test/README.debug.md
@@ -6,8 +6,8 @@
  * RANDOM\_TESTS\_SELECTED: Limiting random tests to only selected subset defined as comma separated list of test indexes. E.x. "3,4,6". Default is no limitation if not specified or empty.
  * MIN\_DIM: Minimal number of dimensions of input tensors. (default: 3)
  * MAX\_DIM: Maximum number of dimensions of input tensors. (default: 4)
- * MIN\_OP\_SIZE: Minimal size of an operator dimension. (default: 16)
- * MAX\_OP\_SIZE: Maximum size of an operator dimension. Smaller operator size results in fewer failed tests. (default: 512)
+ * MIN\_OP\_SIZE\_PER\_DIM: Minimal size of an operator dimension. (default: 16)
+ * MAX\_OP\_SIZE\_PER\_DIM: Maximum size of an operator dimension. Smaller operator size results in fewer failed tests. (default: 64)
  * MIN_MICROBATCH_SIZE: Minimal size of microbatch of an input tensor. (default: 1)
  * MAX_MICROBATCH_SIZE: Maximum size of microbatch of an input tensor. (default: 8)
  * NUM\_OF\_NODES: Maximum number of nodes to be generated by RGG. (default: 10)
diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py
index 4d79cd8c..f6c60122 100644
--- a/pybuda/test/random/rgg/__init__.py
+++ b/pybuda/test/random/rgg/__init__.py
@@ -11,7 +11,7 @@
 from .base import Framework, GraphBuilder, ModelBuilder
 from .base import RandomizerRunner, RandomizerCodeGenerator, process_test
 from .frameworks import Frameworks
-from .algorithms import NodesUtils
+from .algorithms import GraphNodeSetup
 from .algorithms import RandomGraphAlgorithm
 
 __all__ = [
@@ -33,6 +33,6 @@
     "RandomizerCodeGenerator",
     "process_test",
     "Frameworks",
-    "NodesUtils",
+    "GraphNodeSetup",
     "RandomGraphAlgorithm",
 ]
diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py
index 306f3393..20f698a6 100644
--- a/pybuda/test/random/rgg/algorithms.py
+++ b/pybuda/test/random/rgg/algorithms.py
@@ -16,21 +16,23 @@
 from .utils import RandomUtils, StrUtils, NodeUtils
 
 
-class NodesUtils:
+class GraphNodeSetup:
+    '''Common step for completion of setting up and validating of the graph
+    after it's built by a graph builder algorithm'''
 
     # Whether to always generate unique variables for each node
     always_unique_variables = False
 
     @classmethod
-    def init_nodes(cls, graph: RandomizerGraph):
+    def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random):
         """
         Initializes the nodes of a graph. 
 
         This method does three main things:
         1. Sets the index for each node.
         2. Stores output values if they are needed as explicit input for a later operator.
-        3. Setting input nodes for open nodes
-        4. Validates the input configuration.
+        3. Setting input nodes for open nodes.
+        4. Generates random settings for operator parameters.
 
         Args:
             graph (RandomizerGraph): The graph nodes to initialize.
@@ -65,11 +67,37 @@ def init_nodes(cls, graph: RandomizerGraph):
 
         # Setting input nodes for open nodes
         for node in open_nodes:
-            for _ in range(node.operator.input_num - len(node.inputs)):
-                input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}")
-                graph.input_nodes.append(input_node)
+            input_shapes = node.input_shapes
+            for i in range(len(node.inputs), node.operator.input_num):
+                input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shapes[i] and input_node not in node.inputs]
+                if len(input_nodes_with_same_shape) > 0:
+                    # reuse existing input node with the same shape that is not already connected to the node
+                    input_node = input_nodes_with_same_shape[0]
+                else:
+                    input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}", input_shape=input_shapes[i])
+                    graph.input_nodes.append(input_node)
                 node.inputs.append(input_node)
 
+        # Generate random values for operator parameters
+        for node in nodes:
+            node.constructor_kwargs = RandomUtils.constructor_kwargs(node.operator, node.constructor_kwargs, rng_params)
+            node.forward_kwargs = RandomUtils.forward_kwargs(node.operator, node.forward_kwargs, rng_params)
+
+    @classmethod
+    def validate_graph(cls, graph: RandomizerGraph):
+        '''Validates the graph
+        1. Validates the number of inputs for each node
+        2. Validates operator class type
+
+        Args:
+            graph (RandomizerGraph): The graph to validate
+
+        Raises:
+            Exception: If the number of inputs for a node does not match the configured input number.
+            Exception: If the node operator is not of type RandomizerOperator.
+        '''
+        nodes = graph.nodes
+
         # Validation of input configuration
         for node in nodes:
             if node.operator.input_num and node.operator.input_num > 1:
@@ -81,14 +109,22 @@ def init_nodes(cls, graph: RandomizerGraph):
             if node.operator and not isinstance(node.operator, OperatorDefinition):
                 raise Exception(f"Step operator is wrong type {node.node_info()} expected RandomizerOperator got {type(node.operator)}")
 
-        nodes_str = StrUtils.nodes_to_str(nodes)
+    @classmethod
+    def prepare_graph(cls, graph: RandomizerGraph, rng_params: random.Random):
+        cls.init_nodes(graph, rng_params)
+        cls.validate_graph(graph)
+
+        nodes_str = StrUtils.nodes_to_str(graph.nodes)
         logger.trace(f"Nodes: \n{nodes_str}")
 
 
 class RandomGraphAlgorithm(GraphBuilder):
+    '''Implementation of the random graph building algorithm'''
 
     SKIP_OPERATORS = (
-        "matmul",  # skip matmul until shape calculation support is added
+        "sqrt",  # skip because it's failing for negative values
+        # "linear",
+        "conv2d",  # skip until calc_input_shapes is properly implemented
     )
 
     def __init__(self, framework: Framework, randomizer_config):
@@ -96,15 +132,34 @@ def __init__(self, framework: Framework, randomizer_config):
         self.framework = framework
         self.operators = [
             op for op in framework.operator_repository.operators
-            if
-                not op.is_layer() and
-                op.name not in self.SKIP_OPERATORS
+            if op.name not in self.SKIP_OPERATORS
         ]
 
-    def get_single_input_operator(self, rng):
+    def _get_random_operator(self, rng):
         return rng.choice(self.operators)
 
+    def _init_default_constructor_params(self, node: RandomizerNode):
+        '''Pre-fill shape-dependent constructor kwargs (feature/channel counts) from the node's input and output shapes'''
+        # Operator specific settings; TODO abstract this
+        declared = [param.name for param in node.operator.constructor_params]
+        if declared.count("in_features") == 1:
+            node.constructor_kwargs["in_features"] = node.input_shapes[0][-1]
+        if declared.count("out_features") == 1:
+            node.constructor_kwargs["out_features"] = node.output_shape[-1]
+        if declared.count("in_channels") == 1:
+            node.constructor_kwargs["in_channels"] = node.input_shapes[0][1]
+        if declared.count("out_channels") == 1:
+            node.constructor_kwargs["out_channels"] = node.output_shape[1]
+
+    # Build graph of random operators via random graph building algorithm
+    # Graph contains between num_of_nodes/2 and num_of_nodes nodes
+    # Graph is constructed backwards starting from end node
+    # In each step a random operator is selected and a new node is created
+    # New node is connected to the last node and optionally to a random node with the same input shape
+    # When new node is connected to 2 nodes graph contains a fork join
+    # Input shapes for each node are calculated based on output shape of the node
     def build_graph(self, test_context: RandomizerTestContext):
+        '''Implementation of the random graph building algorithm'''
         parameters = test_context.parameters
         graph = test_context.graph
         nodes = graph.nodes
@@ -112,50 +167,71 @@ def build_graph(self, test_context: RandomizerTestContext):
         # Initialize random number generators for graph building
         rng_graph = random.Random(parameters.random_seed)
 
+        # Initialize random number generators for shape generation
+        rng_shape = random.Random(test_context.parameters.random_seed)
+
+        # Initialize random number generators for parameters
+        rng_params = random.Random(test_context.parameters.random_seed)
+
         num_of_nodes = self.randomizer_config.num_of_nodes
 
+        # Building the graph with number of nodes between n/2 and n
+        # num_of_nodes defines max number of nodes in the graph
         for _ in range(rng_graph.randint(int(num_of_nodes/2), num_of_nodes)):
             # Choose operator randomly based on rng
-            op1 = self.get_single_input_operator(rng_graph)
-
-            # if op1.is_layer:
-            #     # Layers require number of input and output features
-            #     nodes.append(RandomizerNode(operator=op1, in_features=cols1, out_features=cols2))
-            # else:
-            #     nodes.append(RandomizerNode(operator=op1))
-
-            open_nodes = NodeUtils.get_open_nodes(nodes)
+            op1 = self._get_random_operator(rng_graph)
 
+            # Last node defines output shape for next node to create
             last_node: RandomizerNode = None
+            # Random node is selected by matching the same input shape to support fork joins
+            # TODO random_node -> random_nodes, select all random_nodes instead of just one
+            # TODO: obsolete last_node in favor of random_nodes
             random_node: RandomizerNode = None
 
             if len(nodes) > 0:
+                # If graph is not empty find previously added node
                 last_node = nodes[0]
 
+            if len(nodes) == 0:
+                # Setting output shape for the first node
+                output_shape = RandomUtils.random_shape_from_config(self.randomizer_config, rng_shape)
+            else:
+                # Setting output shape based on last node input shapes
+                input_shapes = last_node.input_shapes
+                output_shape = input_shapes[len(last_node.inputs)]
+
+            # Find open nodes with input shape matching the output shape of new node
+            open_nodes = NodeUtils.get_open_nodes_with_input_shape(nodes, output_shape)
+
             if len(open_nodes) > 0:
+                # Randomly selecting one of the open nodes
                 random_node = rng_graph.choice(open_nodes)
 
             if last_node is not None and random_node is not None and last_node == random_node:
+                # Skip random_node if it's the same as last_node
                 random_node = None
 
+            # Closing nodes are last_node and optionally random_node
             closing_nodes = [closing_node for closing_node in [last_node, random_node] if closing_node is not None]
 
-            node = RandomizerNode(operator=op1)
+            # Creating new node
+            node = RandomizerNode(operator=op1, output_shape=output_shape)
+            # Saving input shapes for the new node
+            node.input_shapes = NodeUtils.calc_input_shapes(node, rng_shape)
+
+            # Initializing default constructor parameters based on input and output shapes
+            self._init_default_constructor_params(node)
 
             for closing_node in closing_nodes:
                 for _ in range(rng_graph.randint(1, closing_node.operator.input_num - len(closing_node.inputs))):
-                    closing_node.inputs.append(node)
+                    # currently only if next input of closing node matches the output shape a closing node will be actually closed
+                    # TODO check all inputs for matching shapes not just next one
+                    # if the second operand has a different shape than the first one it will most likely not be closed with an internal node but with an external input
+                    # e.g. the second operand of matmul usually connects to an external input instead of an internal node
+                    if closing_node.input_shapes[len(closing_node.inputs)] == node.output_shape:
+                        closing_node.inputs.append(node)
 
             open_nodes.append(node)
             nodes.insert(0, node)
 
-        NodesUtils.init_nodes(graph)
-
-        # Initialize random number generators for shape generation
-        rng_shape = random.Random(test_context.parameters.random_seed)
-
-        # Provide input shapes for validation
-        # TODO support operands with different shapes
-        input_shape = RandomUtils.random_shape_from_config(self.randomizer_config, rng_shape)
-        for input_node in test_context.graph.input_nodes:
-            input_node.input_shape = input_shape
+        GraphNodeSetup.prepare_graph(graph, rng_params)
diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py
index 209ff744..255e6b0c 100644
--- a/pybuda/test/random/rgg/base.py
+++ b/pybuda/test/random/rgg/base.py
@@ -16,7 +16,8 @@
 from test.conftest import TestDevice
 from .datatypes import RandomizerNode, RandomizerGraph, RandomizerParameters, RandomizerConfig, ExecutionContext
 from .datatypes import RandomizerTestContext
-from .utils import RandomUtils, StrUtils, GraphUtils
+from .datatypes import TensorShape
+from .utils import StrUtils, GraphUtils
 
 
 class GraphBuilder:
@@ -67,30 +68,17 @@ def __init__(self, template_dir: str):
         self.template = Environment(loader=FileSystemLoader(template_dir)).get_template('generated_model.jinja2')
 
     def constructor_kwargs(self, node: RandomizerNode):
-        constructor_kwargs = RandomUtils.constructor_kwargs(node.operator)
-        return StrUtils.kwargs_str(**constructor_kwargs)
+        return StrUtils.kwargs_str(**node.constructor_kwargs)
 
     def forward_args(self, node: RandomizerNode) -> str:
         args_str = ", ".join([f"inputs[{i}]" for i in range(node.operator.input_num)])
         return args_str
     
     def forward_kwargs(self, node: RandomizerNode) -> str:
-        forward_kwargs = RandomUtils.forward_kwargs(node.operator)
-        return StrUtils.kwargs_str(**forward_kwargs)
+        return StrUtils.kwargs_str(**node.forward_kwargs)
 
-    # TODO obsolete by constructor_kwargs
-    def build_layer(self, node: RandomizerNode) -> str:
-        if node.operator.is_layer() and node.operator.full_name is not None:
-            return f"{node.operator.full_name}({node.in_features}, {node.out_features} {self.constructor_kwargs(node)})"
-        else:
-            raise Exception(f"Unsupported layer building for {node.node_info()}")
-
-    # def call_operator(self, ctx: ExecutionContext) -> str:
-    #     if ctx.node.operator.is_operator() and ctx.node.operator.full_name is not None:
-    #         v = f"{ctx.node.operator.full_name}('{ctx.node.node_name()}', {self.forward_args(ctx.node)} {self.forward_kwargs(ctx.node)})"
-    #     else:
-    #         raise Exception(f"Unsupported operator call for {ctx.node.node_info()}")
-    #     return v
+    def reduce_microbatch_size(self, shape: TensorShape) -> TensorShape:
+        return (1, ) + shape[1:]  # same shape with the first dimension replaced by 1
 
     def generate_code(self, test_context: RandomizerTestContext, test_format: bool = True) -> str:
         # TODO setup random seed in generated test function
@@ -99,17 +87,17 @@ def generate_code(self, test_context: RandomizerTestContext, test_format: bool =
         template = self.template
 
         code_str = template.render(
+            randomizer_config = test_context.randomizer_config,
             graph_builder_name = parameters.graph_builder_name,
             test_id = StrUtils.test_id(test_context),
             test_format = test_format,
             test_index = parameters.test_index,
             random_seed = parameters.random_seed,
             graph=test_context.graph,
-            build_layer=self.build_layer,
-            # call_operator=self.call_operator,
             constructor_kwargs=self.constructor_kwargs,
             forward_args=self.forward_args,
             forward_kwargs=self.forward_kwargs,
+            reduce_microbatch_size=self.reduce_microbatch_size,
             ExecutionContext=ExecutionContext,
             )
 
diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py
index 96635d7a..f1d1d8b7 100644
--- a/pybuda/test/random/rgg/config.py
+++ b/pybuda/test/random/rgg/config.py
@@ -19,11 +19,14 @@ def get_randomizer_config_default():
         run_test = True,
         save_tests = True,
         # build_model_from_code = False,
+        verify_shapes = False,
         # TODO ranges
-        dim_min=int(os.environ.get("MIN_DIM", 3)),
+        # dim_min=int(os.environ.get("MIN_DIM", 3)),
+        dim_min=int(os.environ.get("MIN_DIM", 4)),  # Until #2722 is resolved
         dim_max=int(os.environ.get("MAX_DIM", 4)),
-        op_size_min=int(os.environ.get("MIN_OP_SIZE", 16)),
-        op_size_max=int(os.environ.get("MAX_OP_SIZE", 512)),
+        op_size_per_dim_min=int(os.environ.get("MIN_OP_SIZE_PER_DIM", 16)),
+        op_size_per_dim_max=int(os.environ.get("MAX_OP_SIZE_PER_DIM", 64)),  # by default run with smaller sizes
+        # op_size_per_dim_max=int(os.environ.get("MAX_OP_SIZE_PER_DIM", 512)),
         microbatch_size_min=int(os.environ.get("MIN_MICROBATCH_SIZE", 1)),
         microbatch_size_max=int(os.environ.get("MAX_MICROBATCH_SIZE", 8)),
         num_of_nodes=int(os.environ.get("NUM_OF_NODES", 10)),
diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py
index 66d1d715..b66924d9 100644
--- a/pybuda/test/random/rgg/datatypes.py
+++ b/pybuda/test/random/rgg/datatypes.py
@@ -17,8 +17,8 @@
 
 @dataclass
 class RandomizerInputNode:
-    out_value: Optional[str] = None
-    input_shape: Optional[TensorShape] = None  # will be set by the graph_builder.init_shapes later
+    out_value: str
+    input_shape: TensorShape
 
 
 @dataclass
@@ -26,9 +26,11 @@ class RandomizerNode:
     index: Optional[int] = None
     out_value: Optional[str] = None
     operator: Optional[OperatorDefinition] = None
-    in_features: Optional[int] = None
-    out_features: Optional[int] = None
     inputs: List['RandomizerNode'] = field(default_factory=list)
+    constructor_kwargs: Dict[str, object] = field(default_factory=dict)
+    forward_kwargs: Dict[str, object] = field(default_factory=dict)
+    input_shapes: List[TensorShape] = field(default_factory=list)
+    output_shape: TensorShape = None
 
     def operator_name(self):
         return f"op{self.index}"
@@ -81,10 +83,11 @@ class RandomizerConfig:
     test_dir:str = "pybuda/test/random_tests"
     save_tests: bool = False
     # build_model_from_code: bool = False  # TODO remove obsoleted
+    verify_shapes: bool = False  # when true the templates emit output-shape asserts; no trailing comma, which would make the default the truthy tuple (False,)
     dim_min: int = 3
     dim_max: int = 4
-    op_size_min: int = 16
-    op_size_max: int = 512
+    op_size_per_dim_min: int = 16
+    op_size_per_dim_max: int = 512
     microbatch_size_min: int = 1
     microbatch_size_max: int = 8
     num_of_nodes: int = 10
diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2
index ec470817..1575046c 100644
--- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2
+++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2
@@ -17,14 +17,18 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule):
         super(GeneratedTestModel_{{ test_index }}_{{ random_seed }}, self).__init__(module_name)
         self.testname = "Operator Test GeneratedTestModel_{{ test_id }}"
 {% for node in graph.nodes %}{% if node.operator.is_layer() %}        
-        self.{{ node.layer_name() }} = {{ build_layer(node) }}{% endif %}{% endfor %}
+        self.{{ node.layer_name() }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %}
 
-    def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: pybuda.Tensor{% endfor %}) -> pybuda.Tensor:
+    def forward(self{% for node in graph.input_nodes %},
+            {{ node.out_value }}: pybuda.Tensor{% endfor %}
+        ) -> pybuda.Tensor:
         {% for node in graph.nodes %}
 
+        # shapes: {{ node.input_shapes }} -> {{ node.output_shape }}
         inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if node.operator.is_layer() %}
         {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %}
-        {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name() }}', {{ forward_args(node=node) }}{{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% endfor %}
+        {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name() }}', {{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
+        assert {{ node.out_value }}.shape == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
 
         return v
 {% if test_format %}
@@ -32,7 +36,9 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule):
 # @pytest.mark.skip(reason="Skip this test for now.")
 def test_gen_model_{{ test_index }}_{{ random_seed }}(test_device):
     
-    input_shapes = [{% for input_node in graph.input_nodes %}{{ input_node.input_shape }}, {% endfor %}]
+    input_shapes = [
+        {% for input_node in graph.input_nodes %}{{ input_node.input_shape }},
+        {% endfor %}]
     model = GeneratedTestModel_{{ test_index }}_{{ random_seed }}("pytest_gen_model_{{ test_id }}")
 
     verify_module(model, input_shapes, VerifyConfig(devtype=test_device.devtype, arch=test_device.arch))
diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
index 8b11f095..1f7319d6 100644
--- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2
+++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
@@ -14,14 +14,18 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
     def __init__(self):
         super(GeneratedTestModel_{{ test_index }}_{{ random_seed }}, self).__init__()
 {% for node in graph.nodes %}{% if node.operator.is_layer() %}        
-        self.{{ node.layer_name() }} = {{ build_layer(node) }}{% endif %}{% endfor %}
+        self.{{ node.layer_name() }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %}
 
-    def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: torch.Tensor{% endfor %}) -> torch.Tensor:
+    def forward(self{% for node in graph.input_nodes %},
+            {{ node.out_value }}: torch.Tensor{% endfor %}
+        ) -> torch.Tensor:
         {% for node in graph.nodes %}
 
+        # shapes: {{ node.input_shapes }} -> {{ node.output_shape }}
         inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if node.operator.is_layer() %}
         {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %}
-        {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}({{ forward_args(node=node) }}{{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% endfor %}
+        {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}({{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
+        assert {{ node.out_value }}.shape == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
 
         return v
 {% if test_format %}
@@ -29,7 +33,9 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
 # @pytest.mark.skip(reason="Skip this test for now.")
 def test_gen_model_{{ test_index }}_{{ random_seed }}(test_device):
     
-    input_shapes = [{% for input_node in graph.input_nodes %}{{ input_node.input_shape }}, {% endfor %}]
+    input_shapes = [
+        {% for input_node in graph.input_nodes %}{{ input_node.input_shape }},
+        {% endfor %}]
     pytorch_model = GeneratedTestModel_{{ test_index }}_{{ random_seed }}()
     model = pybuda.PyTorchModule("pytest_gen_model_{{ test_id }}", pytorch_model)
 
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index ea7a3288..de978c33 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -10,9 +10,7 @@
 import re
 import yaml
 
-import numpy as np
-
-from pybuda.op_repo import OperatorParam, OperatorParamNumber
+from pybuda.op_repo import OperatorParam, OperatorDefinition, OperatorParamNumber
 
 from .datatypes import TensorShape
 from .datatypes import RandomizerConfig, RandomizerTestContext, RandomizerNode, RandomizerGraph
@@ -23,8 +21,6 @@ class StrUtils:
     @staticmethod
     def kwargs_str(**kwargs):
         s = ', '.join([f"{key}= {value}" for key, value in kwargs.items()])
-        if s:
-            s = ", " + s
         return s
 
     @staticmethod
@@ -56,30 +52,30 @@ def nodes_to_str(nodes: List[RandomizerNode]) -> str:
 class RandomUtils:
 
     @classmethod
-    def random_value_for_param(cls, param: OperatorParam):
+    def random_value_for_param(cls, param: OperatorParam, rng_params: random.Random):
         if isinstance(param, OperatorParamNumber):
-            return cls.random_value_for_number_param(param)
+            return cls.random_value_for_number_param(param, rng_params)
         else:
             raise ValueError(f"Unsupported param type {type(param)}")
 
     @classmethod
-    def random_value_for_number_param(cls, param: OperatorParamNumber) -> int:
-        # TODO: reuse seed
+    def random_value_for_number_param(cls, param: OperatorParamNumber, rng_params: random.Random) -> int:
         # TODO: support open intervals
+        # TODO: store rng_params in test_context
         if param.type == float:
-            return np.random.uniform(param.min_value, param.max_value)
+            return rng_params.uniform(param.min_value, param.max_value)
         elif param.type == int:
-            return np.random.randint(param.min_value, param.max_value + 1)
+            return rng_params.randint(param.min_value, param.max_value)  # random.Random.randint is inclusive on both ends (unlike np.random.randint)
         else:
             raise ValueError(f"Unsupported type {param.type}")
 
     @classmethod
-    def constructor_kwargs(cls, param: OperatorParam) -> Dict:
-        return {param.name: cls.random_value_for_param(param) for param in param.constructor_params}
+    def constructor_kwargs(cls, operator: OperatorDefinition, constructor_kwargs: Dict[str, object], rng_params: random.Random) -> Dict:
+        return {param.name: cls.random_value_for_param(param, rng_params) if param.name not in constructor_kwargs else constructor_kwargs[param.name] for param in operator.constructor_params}
 
     @classmethod
-    def forward_kwargs(cls, param: OperatorParam) -> Dict:
-        return {param.name: cls.random_value_for_param(param) for param in param.forward_params}
+    def forward_kwargs(cls, operator: OperatorDefinition, forward_kwargs: Dict[str, object], rng_params: random.Random) -> Dict:
+        return {param.name: cls.random_value_for_param(param, rng_params) if param.name not in forward_kwargs else forward_kwargs[param.name] for param in operator.forward_params}
 
     @classmethod
     def random_shape(cls,
@@ -100,8 +96,8 @@ def random_shape(cls,
 
     @classmethod
     def random_shape_from_config(cls, randomizer_config: RandomizerConfig, rng_shape: random.Random) -> TensorShape:
-        op_size_min = randomizer_config.op_size_min
-        op_size_max = randomizer_config.op_size_max
+        op_size_min = randomizer_config.op_size_per_dim_min
+        op_size_max = randomizer_config.op_size_per_dim_max
 
         dim_min = randomizer_config.dim_min
         dim_max = randomizer_config.dim_max
@@ -160,3 +156,12 @@ def is_open(node: RandomizerNode) -> bool:
     @classmethod
     def get_open_nodes(cls, nodes: List[RandomizerNode]) -> List[RandomizerNode]:
         return [node for node in nodes if cls.is_open(node)]
+
+    @classmethod
+    def get_open_nodes_with_input_shape(cls, nodes: List[RandomizerNode], input_shape: TensorShape) -> List[RandomizerNode]:
+        # TODO: support checking all unconnected operands, not just the next one
+        return [node for node in nodes if cls.is_open(node) and node.input_shapes[len(node.inputs)] == input_shape]
+
+    @classmethod
+    def calc_input_shapes(cls, node: RandomizerNode, rng_shape: random.Random) -> List[TensorShape]:
+        return node.operator.calc_input_shapes(node.operator, node.output_shape, rng_shape)
diff --git a/pybuda/test/random/test_three_ops.py b/pybuda/test/random/test_three_ops.py
index 16586334..e6b474a1 100644
--- a/pybuda/test/random/test_three_ops.py
+++ b/pybuda/test/random/test_three_ops.py
@@ -58,8 +58,8 @@ def test_three_ops(test_index, random_seeds, test_device, randomizer_config):
     random_seed = random_seeds[test_index]
     rng = random.Random(random_seed)
 
-    op_size_min = randomizer_config.op_size_min
-    op_size_max = randomizer_config.op_size_max
+    op_size_min = randomizer_config.op_size_per_dim_min
+    op_size_max = randomizer_config.op_size_per_dim_max
     
     rows = rng.randint(op_size_min, op_size_max)
     cols1 = rng.randint(op_size_min, op_size_max)

From 3e8d19f3b95b103da2700fff6dcf165ff13a6211 Mon Sep 17 00:00:00 2001
From: Nikola Obradovic <nobradovic@tenstorrent.com>
Date: Thu, 13 Jun 2024 08:25:05 +0000
Subject: [PATCH 04/29] [Ribbon2] Switch to util function for linked output
 nodes.

(cherry picked from commit 75eb3bc03afec42b50cdc76af8da5ce5c04945a3)
---
 .../csrc/balancer/policies/policy_ribbon2.cpp   | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp b/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
index 9ccb7c18..e3c3edbf 100644
--- a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
+++ b/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
@@ -92,19 +92,13 @@ std::optional<OpModel> get_closest_op_model_conservative(
     return closest_model;
 }
 
-bool operand_of_datacopy_output_node(const graphlib::Graph *graph, const graphlib::Node *node)
+bool operand_of_linked_output_node(const graphlib::Graph *graph, const graphlib::Node *node)
 {
     for (const graphlib::Node *user_node : graph->data_users(node))
     {
-        if (user_node->node_type() == graphlib::NodeType::kOutput)
+        if (user_node->node_type() == graphlib::NodeType::kOutput and is_linked_queue(graph, user_node))
         {
-            std::function<bool(tt::graphlib::Edge)> is_partial_datacopy_edge = [](Edge e)
-            { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
-            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(user_node, is_partial_datacopy_edge);
-            if (!partial_datacopy_edges.empty())
-            {
-                return true;
-            }
+            return true;
         }
     }
 
@@ -180,9 +174,10 @@ EpochSolution optimize_solution_conservative(
         {
             const OpModel &source_op_model = best_solution.get_selected_op_models()[op_index];
 
-            // Mitigation for datacopy output nodes. We don't want to bump up the grid of the datacopy output node.
+            // Mitigation for linked output nodes. We don't want to bump up the grid of the linked output node because
+            // of higher chance of op_model mismatch on OPs feeding the fake output.
             //
-            if (operand_of_datacopy_output_node(graph, source_op_model.buda_op_node))
+            if (operand_of_linked_output_node(graph, source_op_model.buda_op_node))
             {
                 continue;
             }

From c2e1c3343a753e0bff1625b082f4f90611b405b1 Mon Sep 17 00:00:00 2001
From: Konstantin Milanovic <kmilanovic@tenstorrent.com>
Date: Thu, 13 Jun 2024 09:04:33 +0000
Subject: [PATCH 05/29] Move sparse matmul op tests from sanity

(cherry picked from commit 1ca541bcea483bd1708353f9de25412a2c4c1646)
---
 .../operators/matmul/test_sparse_matmul.py    | 152 ++++++++++++++++++
 pybuda/test/test_sanity.py                    | 130 ---------------
 2 files changed, 152 insertions(+), 130 deletions(-)
 create mode 100644 pybuda/test/operators/matmul/test_sparse_matmul.py

diff --git a/pybuda/test/operators/matmul/test_sparse_matmul.py b/pybuda/test/operators/matmul/test_sparse_matmul.py
new file mode 100644
index 00000000..d43c22a7
--- /dev/null
+++ b/pybuda/test/operators/matmul/test_sparse_matmul.py
@@ -0,0 +1,152 @@
+
+# From sanity
+
+import pytest
+
+from pybuda.config import _get_global_compiler_config
+from pybuda import Tensor
+import torch
+
+from pybuda import pybuda
+
+from pybuda.verify.config import TestKind, VerifyConfig
+
+from test.common import run
+
+from pybuda.module import PyBudaModule
+
+
+
+
+# Tests from sanity moved here:
+
+@pytest.mark.parametrize("config", ["3x3conv", "data_mismatch", "c_stream", "in_out_stream"])
+def test_sparse_matmul(test_device, config):
+    from pybuda.op.eval.sparse_utils import create_conv2d_sparse_picker_matrix
+
+    compiler_cfg = _get_global_compiler_config()
+    compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid"
+
+    if config == "3x3conv":
+        iH, iW = (64, 64)
+        inC = 32
+        kH, kW = (3, 3)
+        stride = (2, 2)
+        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
+        dilation = 1
+
+        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
+        act = Tensor.create_from_torch(t)
+
+        pickers = []
+        for kY in range(kH):
+            for kX in range(kW):
+                y_shift = ((kH - 1) // 2) - kY
+                x_shift = ((kW - 1) // 2) - kX
+                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
+                pickers.append(picker)
+        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
+    elif config == "data_mismatch":
+        minimal_tiles = 2
+        act = torch.randn(32*minimal_tiles,32).unsqueeze(0).unsqueeze(0)
+        out_tiles = minimal_tiles // 2
+        eye = torch.eye(32*minimal_tiles, 32*minimal_tiles)
+        pickers = [
+            eye[:(out_tiles*32), :].to_sparse(),
+            eye[(out_tiles*32-16):-16, :].to_sparse(),
+        ]
+        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
+    elif config == "c_stream":
+
+        pytest.skip() # tenstorrent/budabackend#1543
+        pybuda.config.override_t_stream_dir("sparse0.lc2", "C")
+        pybuda.config.override_t_stream_shape("sparse0.lc2", (1, 32))
+        iH, iW = (64, 64)
+        inC = 1024
+        kH, kW = (1, 1)
+        stride = (2, 2)
+        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
+        dilation = 1
+
+        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
+        act = Tensor.create_from_torch(t)
+
+        pickers = []
+        for kY in range(kH):
+            for kX in range(kW):
+                y_shift = ((kH - 1) // 2) - kY
+                x_shift = ((kW - 1) // 2) - kX
+                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
+                pickers.append(picker)
+        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
+    elif config == "in_out_stream":
+        pybuda.config.override_t_stream_dir("buf0", "R")
+        pybuda.config.override_t_stream_shape("buf0", (2, 1))
+        pybuda.config.override_t_stream_dir("sparse0.lc2", "R")
+        pybuda.config.override_t_stream_shape("sparse0.lc2", (3, 1))
+
+        iH, iW = (32, 32)
+        inC = 32
+        kH, kW = (3, 3)
+        stride = (2, 2)
+        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
+        dilation = 1
+
+        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
+        act = Tensor.create_from_torch(t)
+
+        pickers = []
+        for kY in range(kH):
+            for kX in range(kW):
+                y_shift = ((kH - 1) // 2) - kY
+                x_shift = ((kW - 1) // 2) - kX
+                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
+                pickers.append(picker)
+        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
+    else:
+        raise RuntimeError("Unknown config")
+
+    @run(
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch),
+    )
+    def simple_sparse_matmul(act, sparse=None):
+        if config == "in_out_stream":
+            act = pybuda.op.Buffer("buf0", act)
+        return pybuda.op.SparseMatmul("sparse0", sparse, act)
+
+    simple_sparse_matmul(act, sparse=sparse)
+
+    
+def test_z_sparse_matmul(test_device):
+    input_shape = (1, 64, 128, 128)
+
+    class Model(PyBudaModule):
+        def __init__(self):
+            super().__init__(name="sparsematmul_test")
+            rows = torch.arange(0, 128).tolist()
+            cols = rows
+            sparse = torch.sparse_coo_tensor([rows, cols],torch.ones(len(cols)), (128, 128), dtype=torch.float32)
+            sparse = torch.stack([sparse]*64, -3)
+            sparse = torch.unsqueeze(sparse, 0) 
+            self.add_constant("sparse")
+            self.set_constant("sparse", pybuda.Tensor.create_from_torch(sparse, constant=True))
+
+        def forward(self, x):
+            out = pybuda.op.SparseMatmul("", self.get_constant("sparse"), x)
+            return out
+
+    compiler_cfg = _get_global_compiler_config()
+    compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid"
+
+    pybuda.verify.verify_module(
+        Model(),
+        (input_shape,),
+        verify_cfg=VerifyConfig(
+            arch=test_device.arch,
+            devtype=test_device.devtype,
+            test_kind=TestKind.INFERENCE,
+        ),
+    )
\ No newline at end of file
diff --git a/pybuda/test/test_sanity.py b/pybuda/test/test_sanity.py
index 057415b8..8c90314e 100644
--- a/pybuda/test/test_sanity.py
+++ b/pybuda/test/test_sanity.py
@@ -570,136 +570,6 @@ def test_recompute(test_device):
             VerifyConfig(test_kind=TestKind.TRAINING_RECOMPUTE, devtype=test_device.devtype, arch=test_device.arch, 
                 epoch_breaks=[f"matmul_{i}" for i in range(0, num_matmuls, 2)]))
 
-def test_z_sparse_matmul(test_device):
-    input_shape = (1, 64, 128, 128)
-
-    class Model(PyBudaModule):
-        def __init__(self):
-            super().__init__(name="sparsematmul_test")
-            rows = torch.arange(0, 128).tolist()
-            cols = rows
-            sparse = torch.sparse_coo_tensor([rows, cols],torch.ones(len(cols)), (128, 128), dtype=torch.float32)
-            sparse = torch.stack([sparse]*64, -3)
-            sparse = torch.unsqueeze(sparse, 0) 
-            self.add_constant("sparse")
-            self.set_constant("sparse", pybuda.Tensor.create_from_torch(sparse, constant=True))
-
-        def forward(self, x):
-            out = pybuda.op.SparseMatmul("", self.get_constant("sparse"), x)
-            return out
-
-    compiler_cfg = _get_global_compiler_config()
-    compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid"
-
-    pybuda.verify.verify_module(
-        Model(),
-        (input_shape,),
-        verify_cfg=VerifyConfig(
-            arch=test_device.arch,
-            devtype=test_device.devtype,
-            test_kind=TestKind.INFERENCE,
-        ),
-    )
-    
-@pytest.mark.parametrize("config", ["3x3conv", "data_mismatch", "c_stream", "in_out_stream"])
-def test_sparse_matmul(test_device, config):
-    from pybuda.op.eval.sparse_utils import create_conv2d_sparse_picker_matrix
-
-    compiler_cfg = _get_global_compiler_config()
-    compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid"
-
-    if config == "3x3conv":
-        iH, iW = (64, 64)
-        inC = 32
-        kH, kW = (3, 3)
-        stride = (2, 2)
-        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
-        dilation = 1
-
-        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
-        act = Tensor.create_from_torch(t)
-
-        pickers = []
-        for kY in range(kH):
-            for kX in range(kW):
-                y_shift = ((kH - 1) // 2) - kY
-                x_shift = ((kW - 1) // 2) - kX
-                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
-                pickers.append(picker)
-        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
-    elif config == "data_mismatch":
-        minimal_tiles = 2
-        act = torch.randn(32*minimal_tiles,32).unsqueeze(0).unsqueeze(0)
-        out_tiles = minimal_tiles // 2
-        eye = torch.eye(32*minimal_tiles, 32*minimal_tiles)
-        pickers = [
-            eye[:(out_tiles*32), :].to_sparse(),
-            eye[(out_tiles*32-16):-16, :].to_sparse(),
-        ]
-        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
-    elif config == "c_stream":
-
-        pytest.skip() # tenstorrent/budabackend#1543
-        pybuda.config.override_t_stream_dir("sparse0.lc2", "C")
-        pybuda.config.override_t_stream_shape("sparse0.lc2", (1, 32))
-        iH, iW = (64, 64)
-        inC = 1024
-        kH, kW = (1, 1)
-        stride = (2, 2)
-        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
-        dilation = 1
-
-        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
-        act = Tensor.create_from_torch(t)
-
-        pickers = []
-        for kY in range(kH):
-            for kX in range(kW):
-                y_shift = ((kH - 1) // 2) - kY
-                x_shift = ((kW - 1) // 2) - kX
-                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
-                pickers.append(picker)
-        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
-    elif config == "in_out_stream":
-        pybuda.config.override_t_stream_dir("buf0", "R")
-        pybuda.config.override_t_stream_shape("buf0", (2, 1))
-        pybuda.config.override_t_stream_dir("sparse0.lc2", "R")
-        pybuda.config.override_t_stream_shape("sparse0.lc2", (3, 1))
-
-        iH, iW = (32, 32)
-        inC = 32
-        kH, kW = (3, 3)
-        stride = (2, 2)
-        padding = (kW // 2, kW // 2, kH // 2, kH // 2)
-        dilation = 1
-
-        t = torch.arange(iH*iW*inC, dtype=torch.float32).reshape((1, 1, iH * iW, inC))
-        act = Tensor.create_from_torch(t)
-
-        pickers = []
-        for kY in range(kH):
-            for kX in range(kW):
-                y_shift = ((kH - 1) // 2) - kY
-                x_shift = ((kW - 1) // 2) - kX
-                picker = create_conv2d_sparse_picker_matrix(iH, iW, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True)
-                pickers.append(picker)
-        sparse = Tensor.create_from_torch(torch.stack(pickers).unsqueeze(0), constant=True)
-    else:
-        raise RuntimeError("Unknown config")
-
-    @run(
-        verify_cfg=VerifyConfig(
-            test_kind=TestKind.INFERENCE,
-            devtype=test_device.devtype,
-            arch=test_device.arch),
-    )
-    def simple_sparse_matmul(act, sparse=None):
-        if config == "in_out_stream":
-            act = pybuda.op.Buffer("buf0", act)
-        return pybuda.op.SparseMatmul("sparse0", sparse, act)
-
-    simple_sparse_matmul(act, sparse=sparse)
-
 
 def test_simple_clip(test_device):
     compiler_cfg = _get_global_compiler_config()

From ca2fc89f485c621c8e16087bb014584635a1b577 Mon Sep 17 00:00:00 2001
From: mramanathan <mramanathan@tenstorrent.com>
Date: Mon, 10 Jun 2024 09:39:13 +0000
Subject: [PATCH 06/29] Add CCM test for PIDNet in Wormhole_B0(pytorch)

(cherry picked from commit 8fb00f9b9eed67f073162677f9b8ef8fa2dc6a28)
---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 7aba57f5..e3d00baf 100644
--- a/README.md
+++ b/README.md
@@ -49,4 +49,3 @@ Set `LD_LIBRARY_PATH` to the location of `third_party/budabackend/build/lib` - p
 ## Silicon
 
 See README.silicon.md for details on how to run on silicon.
-

From 5d347560964889f5c1988c9699de30dd69192470 Mon Sep 17 00:00:00 2001
From: Radenko Pavlovic <rpavlovic@tenstorrent.com>
Date: Thu, 13 Jun 2024 13:36:30 +0000
Subject: [PATCH 07/29] Enable NOC and DRAM estimates by default

(cherry picked from commit 159ce461a7449fe37454ca9b72fa2651c925b83c)
---
 pybuda/csrc/balancer/policies/policy_utils.cpp | 4 ++--
 pybuda/test/benchmark/benchmark/models/deit.py | 2 ++
 pybuda/test/benchmark/benchmark/models/t5.py   | 6 ++++--
 pybuda/test/benchmark/benchmark/models/unet.py | 2 ++
 pybuda/test/benchmark/benchmark/models/vit.py  | 2 ++
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/pybuda/csrc/balancer/policies/policy_utils.cpp b/pybuda/csrc/balancer/policies/policy_utils.cpp
index fb781309..cbd0fc68 100644
--- a/pybuda/csrc/balancer/policies/policy_utils.cpp
+++ b/pybuda/csrc/balancer/policies/policy_utils.cpp
@@ -1880,10 +1880,10 @@ OpCycleEstimates get_op_cycles_estimates(
     static const bool disable_model_kb_prologue_bw = env_as<bool>("PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW", false);
 
     // Should we use estimates for the NOC bandwidth.
-    static const bool use_noc_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES", false);
+    static const bool use_noc_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES", true);
 
     // Should we use estimates for the DRAM bandwidth.
-    static const bool use_dram_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES", false);
+    static const bool use_dram_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES", true);
 
     const float inefficency_divider = 2.0;
     const float subchannel_oversub_coeff = 1.5;
diff --git a/pybuda/test/benchmark/benchmark/models/deit.py b/pybuda/test/benchmark/benchmark/models/deit.py
index b0174267..ff195aa4 100644
--- a/pybuda/test/benchmark/benchmark/models/deit.py
+++ b/pybuda/test/benchmark/benchmark/models/deit.py
@@ -25,6 +25,8 @@ def deit(training: bool, config: str, microbatch: int, devtype: str, arch: str,
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Disable NOC BW estimates.
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #
diff --git a/pybuda/test/benchmark/benchmark/models/t5.py b/pybuda/test/benchmark/benchmark/models/t5.py
index 867663af..db21a570 100644
--- a/pybuda/test/benchmark/benchmark/models/t5.py
+++ b/pybuda/test/benchmark/benchmark/models/t5.py
@@ -23,6 +23,8 @@ def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, da
     if data_type == "Fp16_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Disable DRAM BW estimates.
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #
@@ -81,7 +83,7 @@ def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: st
         variant = "google/flan-t5-large"
     else:
         raise RuntimeError("Unknown config")
-    
+
     if data_type == "Bfp8_b":
         pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b)
         pybuda.config.configure_mixed_precision(op_type="subtract", output_df=pybuda.DataFormat.Float16_b)
@@ -92,7 +94,7 @@ def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: st
         generate_test_device(devtype, arch),
         variant,
     )
- 
+
     targets = tuple()
 
     return modules, inputs, targets, other
diff --git a/pybuda/test/benchmark/benchmark/models/unet.py b/pybuda/test/benchmark/benchmark/models/unet.py
index 165673ff..57f7c686 100644
--- a/pybuda/test/benchmark/benchmark/models/unet.py
+++ b/pybuda/test/benchmark/benchmark/models/unet.py
@@ -24,6 +24,8 @@ def unet(training: bool, config: str, microbatch: int, devtype: str, arch: str,
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Disable NOC BW estimates.
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # Manually enable amp light for Ribbon
     if compiler_cfg.balancer_policy == "Ribbon":
diff --git a/pybuda/test/benchmark/benchmark/models/vit.py b/pybuda/test/benchmark/benchmark/models/vit.py
index aa011328..3962bda1 100644
--- a/pybuda/test/benchmark/benchmark/models/vit.py
+++ b/pybuda/test/benchmark/benchmark/models/vit.py
@@ -24,6 +24,8 @@ def vit(training: bool, config: str, microbatch: int, devtype: str, arch: str, d
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Disable NOC BW estimates.
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #

From 761678a2bd3d4d26b0319141692633866a66e24d Mon Sep 17 00:00:00 2001
From: Stefan Djordjevic <sdjordjevic@tenstorrent.com>
Date: Thu, 13 Jun 2024 14:38:02 +0000
Subject: [PATCH 08/29] Revert of "Revert "Adding a support for generating TTI
 image for Blackhole" due to..." with the fix

(cherry picked from commit 5a0e1822f7f639b302c94a4ff81972aa75e9957f)
---
 pybuda/pybuda/config.py                                |  1 +
 pybuda/pybuda/tti/runtime_param_yamls/bh_syslevel.yaml | 10 ++++++++++
 pybuda/test/conftest.py                                |  7 ++++---
 setup.py                                               |  6 +++++-
 4 files changed, 20 insertions(+), 4 deletions(-)
 create mode 100644 pybuda/pybuda/tti/runtime_param_yamls/bh_syslevel.yaml

diff --git a/pybuda/pybuda/config.py b/pybuda/pybuda/config.py
index 30b7c33e..9770502e 100644
--- a/pybuda/pybuda/config.py
+++ b/pybuda/pybuda/config.py
@@ -423,6 +423,7 @@ def get_harvesting_mask(row_indices: List[int]):
     "galaxy"       : "tti/runtime_param_yamls/galaxy_syslevel.yaml",
     "gs_e150"      : "tti/runtime_param_yamls/gs_e150_syslevel.yaml",
     "gs_e300"      : "tti/runtime_param_yamls/gs_e300_syslevel.yaml",
+    "bh_test"      : "tti/runtime_param_yamls/bh_syslevel.yaml",
 }
 
 # Global compiler configuration
diff --git a/pybuda/pybuda/tti/runtime_param_yamls/bh_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/bh_syslevel.yaml
new file mode 100644
index 00000000..747ab51b
--- /dev/null
+++ b/pybuda/pybuda/tti/runtime_param_yamls/bh_syslevel.yaml
@@ -0,0 +1,10 @@
+system_level_params:
+  system-device-chip_locations: 0,0,0,0,0,-
+  system-device-chips_with_mmio: 0-
+  system-device-cluster_descriptor: ""
+  system-device-ethernet_connections: ""
+  system-device-num_mmio_devices: 1
+  system-device-number_of_chips: 1
+  system-device0-harvesting_mask: 2048
+  system-device0-num_harvested_rows: 1
+  system-device0-type: BLACKHOLE
\ No newline at end of file
diff --git a/pybuda/test/conftest.py b/pybuda/test/conftest.py
index 0da2c75e..06371506 100644
--- a/pybuda/test/conftest.py
+++ b/pybuda/test/conftest.py
@@ -174,6 +174,7 @@ def no_skip(*args, **kwargs):
     "wh_n150": BackendDevice.Wormhole_B0,
     "wh_n300": BackendDevice.Wormhole_B0,
     "galaxy": BackendDevice.Wormhole_B0,
+    "bh_test": BackendDevice.Blackhole,
 }
 
 @dataclass
@@ -274,6 +275,9 @@ def pytest_generate_tests(metafunc):
 
     if "training" in metafunc.fixturenames:
         metafunc.parametrize("training", (False, True), ids=["inference", "training"])
+    
+    # Configure backend runtime yaml
+    device_cfg_global = metafunc.config.getoption("--device-config")
 
     if "test_device" in metafunc.fixturenames:
         names = ["Golden", "Model", "Versim", "Emulation", "Grayskull", "Wormhole_B0", "Blackhole"]
@@ -309,9 +313,6 @@ def pytest_generate_tests(metafunc):
         
         metafunc.parametrize("test_device", params, ids=ids)
 
-    # Configure backend runtime yaml
-    device_cfg_global = metafunc.config.getoption("--device-config")
-
 environ_before_test = None
 def pytest_runtest_logreport(report):
     if report.when == "setup":
diff --git a/setup.py b/setup.py
index 3684dd10..e92b2e88 100644
--- a/setup.py
+++ b/setup.py
@@ -91,6 +91,10 @@
         "path": "third_party/tt_llk_wormhole_b0",
         "files": "*"
     },
+    "third_party_blackhole": {
+        "path": "third_party/tt_llk_blackhole",
+        "files": "*"
+    },
     "kernel_gen": {
         "path": "build/src/ckernels/gen/out",
         "files": "*",
@@ -140,7 +144,7 @@
     },
 }
 
-if "BACKEND_ARCH_NAME" in os.environ and os.environ["BACKEND_ARCH_NAME"] == "wormhole_b0":
+if os.environ.get("BACKEND_ARCH_NAME") in ("wormhole_b0", "blackhole"):
     bbe_files["firmware_erisc_hex"] = {
         "path": "build/src/firmware/riscv/targets/erisc_app/out",
         "files": [

From 6238cf82ce6ebb11b3c821f4944cde86dd362ad2 Mon Sep 17 00:00:00 2001
From: chandrasekaranpradeep <pchandrasekaran@tenstorrent.com>
Date: Wed, 12 Jun 2024 06:07:11 +0000
Subject: [PATCH 09/29] Add skipped codegen model variants in wh_b0 and gs

(cherry picked from commit 6c995433b4b4164efa3d3fc4b6aa8a0db07e5107)
---
 .../high_prio/nlp/pytorch/test_codegen.py     | 38 ++++++++++++-------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py
index 5fce40d7..a5f9ad10 100644
--- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py
+++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py
@@ -17,27 +17,37 @@
 
 variants = [
     "Salesforce/codegen-350M-mono",
-    # "Salesforce/codegen-350M-multi", # Currently not supported
-    # "Salesforce/codegen-350M-nl", # Currently not supported
+    "Salesforce/codegen-350M-multi",
+    "Salesforce/codegen-350M-nl",
 ]
 
 @pytest.mark.parametrize("variant", variants, ids=variants)
 def test_codegen(test_device, variant):
     # Configurations
     compiler_cfg = pybuda.config._get_global_compiler_config()
-    compiler_cfg.enable_tvm_cpu_fallback = False
-    compiler_cfg.default_dram_parameters = False
-    compiler_cfg.enable_enumerate_u_kt = False
     compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b
-    os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{32*1024}"
-    pcc = 0.98
-    if test_device.arch == BackendDevice.Grayskull:
-        compiler_cfg.default_dram_parameters = False
-        compiler_cfg.balancer_policy = "Ribbon"
-        pcc = 0.96 if test_device.devtype == BackendType.Silicon else 0.98
-    # DRAM stream limit
+    compiler_cfg.balancer_policy = "Ribbon"
+    os.environ["PYBUDA_RIBBON2"] = "1"
+    compiler_cfg.enable_tvm_cpu_fallback = False
     compiler_cfg.balancer_op_override("matmul_1829", "grid_shape", (2, 8))
 
+    pcc_value = 0.99
+    if test_device.arch == BackendDevice.Wormhole_B0:
+        if test_device.devtype == BackendType.Silicon:
+            if variant == "Salesforce/codegen-350M-multi":
+                pcc_value = 0.96
+            elif variant == "Salesforce/codegen-350M-nl":
+                pcc_value = 0.95
+    elif test_device.arch == BackendDevice.Grayskull:
+        if test_device.devtype == BackendType.Silicon:
+            if variant == "Salesforce/codegen-350M-mono":
+                pcc_value = 0.96
+            elif variant == "Salesforce/codegen-350M-multi":
+                pcc_value = 0.93
+            elif variant == "Salesforce/codegen-350M-nl":
+                compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16
+                pcc_value = 0.90
+
     # Load model (with tokenizer)
     tokenizer = download_model(AutoTokenizer.from_pretrained, variant)
     tokenizer.add_special_tokens({"pad_token": "[PAD]"})
@@ -75,7 +85,7 @@ def forward(self, input_ids, attention_mask):
     attn_mask = attn_mask.to(torch.float32)
     out = framework_model(input_ids, attn_mask)
 
-    pybuda_model = pybuda.PyTorchModule("pt_codegen", framework_model)
+    pybuda_model = pybuda.PyTorchModule("pt_"+str(variant.split("/")[-1].replace("-", "_")), framework_model)
     verify_module(
         pybuda_model,
         input_shapes=[(input_ids.shape, attn_mask.shape,)],
@@ -86,6 +96,6 @@ def forward(self, input_ids, attention_mask):
             devmode=test_device.devmode,
             test_kind=TestKind.INFERENCE,
             chip_ids=NebulaGalaxy.chip_ids if "PYBUDA_NEB_GALAXY_CI" in os.environ and int(os.environ.get("PYBUDA_NEB_GALAXY_CI"))==1 else [0],
-            pcc=pcc,
+            pcc=pcc_value,
         ),
     )

From f1fa8c993396749f2b5248cde87fad23de952eff Mon Sep 17 00:00:00 2001
From: Vladica Obojevic <vobojevic@tenstorrent.com>
Date: Fri, 14 Jun 2024 11:40:12 +0000
Subject: [PATCH 10/29] Add test for IndexCopy operator

(cherry picked from commit 123f62088e5b64f4c457352ebc4ffdc37f3b08cb)
---
 pybuda/test/operators/nary/test_index_copy.py | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 pybuda/test/operators/nary/test_index_copy.py

diff --git a/pybuda/test/operators/nary/test_index_copy.py b/pybuda/test/operators/nary/test_index_copy.py
new file mode 100644
index 00000000..677021ca
--- /dev/null
+++ b/pybuda/test/operators/nary/test_index_copy.py
@@ -0,0 +1,170 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tests for the IndexCopy operator
+
+
+
+# GENERAL OP SUPPORT TEST PLAN:
+# 1. Operand type - any supported type
+# 2. Operand source(s):
+# (-)  2.1 From another op
+#       - Operator -> input
+# (-)  2.2 From tm edge
+#       - Combination: operator -> tm -> input
+#       - tm -> input
+# (-)  2.3 From DRAM queue
+#       - input_queue flag = false
+#       - Special case of From host? May it be triggered if the operator is not the first node of the network?
+#       - Can this be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# (-)  2.4 From DRAM, but prologued (constant)
+#       - Constants must be small enough to fit into L1
+#       - Verification via netlists that scenario is triggered
+#       - Input are not prologued for microbatch size = 1
+# (-)  2.5 Const Inputs (const eval pass)
+#       - Operator where all inputs are constants. Does it make difference if tensor is big > L1
+#       - Verification via netlists that scenario is triggered???
+# (-)  2.6 From host
+#       - Input tensor as input of network -> Operator is first node in network and input_queue flag = true
+#       - Can this scenario be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# 3 Operand shapes type(s):
+# (-)  3.1 Full tensor (i.e. full expected shape)
+#       - Is 3 dims max for all ops? Ex. Conv is 3d max
+# (-)  3.2 Tensor reduce on one or more dims to 1
+#       - Vector
+#       - Only one dim is not equal to 1
+# (-)  3.3 Scalar
+#       - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value
+# 4. Operand / output size of dimensions (few examples of each, 10 values total)
+# (-)  4.1 Divisible by 32
+# (-)  4.2 Prime numbers
+# (-)  4.3 Very large (thousands, 10s of thousands)
+#       - 100x100, 100x1000
+#       - maybe nightly only
+# (-)  4.4 Extreme ratios between height/width
+#      4.5 ...probably many more interesting combinations here
+# 5. Data format - all supported formats
+# (-)  5.1 Output DF
+# (-)  5.2 Intermediate DF
+# (-)  5.3 Accumulation DF
+# (-)  5.4 Operand DFs
+# (-) 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4
+# (-) 7. Special attributes - if applicable.. like approx_mode for Exp, for example
+
+
+import pytest
+
+import pybuda
+import torch
+
+from pybuda import PyBudaModule, VerifyConfig
+from pybuda.config import _get_global_compiler_config
+from pybuda.verify import TestKind, verify_module
+
+# The IndexCopy operator in PyBuda works when the index is a vector with a single element
+def test_index_copy_torch_and_buda_1():
+
+    zeros_torch = torch.zeros(6, 3)
+
+    x_torch = zeros_torch
+
+    dim_torch = 0
+    index_torch = torch.tensor([2], dtype=torch.long)
+    source_torch = torch.tensor([[10, 10, 10]], dtype=torch.float)
+
+    # print(f"\nx_torch before:\n{x_torch}")
+    x_torch.index_copy_(dim_torch, index_torch, source_torch)
+    # print(f"\nx_torch after:\n{x_torch}")
+
+    # setting operands for pybuda:
+    operandA = pybuda.tensor.Tensor.create_from_torch(zeros_torch)
+    index = pybuda.tensor.Tensor.create_from_torch(index_torch)
+    value = pybuda.tensor.Tensor.create_from_torch(source_torch)
+    dim_buda = dim_torch
+
+    result_buda = pybuda.op.IndexCopy("IndexCopy0", operandA, index, value, dim_buda)
+    # print(f"\nresult_buda:\n{result_buda}")
+
+    output_are_the_same = torch.eq(x_torch, pybuda.tensor.Tensor.to_pytorch(result_buda)).all()
+    assert output_are_the_same
+
+
+# Case in which the IndexCopy operator does not work
+# In PyTorch, index can be a 1-D tensor of any length, but in PyBuda it can only be a vector of one element
+@pytest.mark.xfail(reason="IndexCopy operator does not work")
+def test_index_copy_torch_and_buda_2():
+
+    zeros_torch = torch.zeros(6, 3)
+
+    x_torch = zeros_torch
+
+    dim_torch = 0
+    index_torch = torch.tensor([0, 2], dtype=torch.long)
+    source_torch = torch.tensor([[10, 10, 10], [20, 20, 20]], dtype=torch.float)
+
+    print(f"\nx_torch before:\n{x_torch}")
+    x_torch.index_copy_(dim_torch, index_torch, source_torch)
+    print(f"\nx_torch after:\n{x_torch}")
+
+    # setting operands for pybuda:
+    operandA = pybuda.tensor.Tensor.create_from_torch(zeros_torch)
+    index = pybuda.tensor.Tensor.create_from_torch(index_torch)
+    value = pybuda.tensor.Tensor.create_from_torch(source_torch)
+
+    result_buda = pybuda.op.IndexCopy("IndexCopy0", operandA, index, value, 0)
+    print(f"\nresult_buda:\n{result_buda}")
+
+    output_are_the_same = torch.eq(x_torch, pybuda.tensor.Tensor.to_pytorch(result_buda)).all()
+    assert output_are_the_same
+
+
+# The IndexCopy operator in PyBuda does not work when tested via a model.
+# The test was run on both Grayskull and Wormhole devices; in both cases it fails with the same error:
+# "...
+#    WARNING  | Always          - Unsupported HW op: IndexCopy0 index_copy(axis: 0)
+#    WARNING  | pybuda.compile:run_balancer_and_placer:949 - Found unsupported HW ops, stopping compilation early:
+#    IndexCopy0 index_copy(axis: 0)
+#    
+#    ERROR    | pybuda.device:run_next_command:469 - Compile error: 'consteval_trace'
+#  ...
+# "
+@pytest.mark.parametrize("input_shape", [(2, 3, 3)])
+@pytest.mark.xfail(reason="IndexCopy operator does not work on any device.")
+def test_index_copy_via_model(test_device, input_shape, input_params=[], math_fidelity=None):
+
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+            self.add_constant("index")
+            self.set_constant("index", pybuda.tensor.Tensor.create_from_torch(torch.tensor([0]), dev_data_format = pybuda.DataFormat.UInt16, constant=True))
+
+        def forward(self, x, y):
+            operandA = x
+            index = self.get_constant("index")
+            value = y
+            dim = 0
+            output = pybuda.op.IndexCopy("IndexCopy0", operandA, index, value, dim)
+            return output
+        
+    mod = Model("test_index_copy_via_model_model")
+    input_shapes = (input_shape,) + (((1,) + input_shape[1:]),)
+    # print(f"\n\n\n**********************  input_shapes: {input_shapes}  ***************************\n\n\n")
+
+    if(math_fidelity is not None):
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
\ No newline at end of file

From 3870713a40f098c86f5613d1876fedf6b7c22b3a Mon Sep 17 00:00:00 2001
From: Konstantin Milanovic <kmilanovic@tenstorrent.com>
Date: Tue, 4 Jun 2024 17:40:45 +0000
Subject: [PATCH 11/29] Validate netlist for matmul

(cherry picked from commit 0e5401efc45603a3c03022bddb1c0c48f02c69d3)
---
 pybuda/test/operators/matmul/test_matmul.py | 118 +++++++++++++++++++-
 1 file changed, 116 insertions(+), 2 deletions(-)

diff --git a/pybuda/test/operators/matmul/test_matmul.py b/pybuda/test/operators/matmul/test_matmul.py
index 9158a9c5..7f4c28b3 100644
--- a/pybuda/test/operators/matmul/test_matmul.py
+++ b/pybuda/test/operators/matmul/test_matmul.py
@@ -84,6 +84,8 @@
 
 from pybuda.op.eval.common import compare_tensor_to_golden
 
+from test.operators.utils import netlist_utils
+
 from .models import generic
 from .models import custom
 from .models import special_cases
@@ -234,7 +236,8 @@ def test_matmul_according_to_pytorch_docs(
 
 def get_input_shapes(microbatch_size1=1, microbatch_size2=1):
                                               # Here we cover interesting combinations of input shapes:
-    return [(microbatch_size1, microbatch_size2, 3, 4),         # 3.1 Full tensor (i.e. full expected shape)
+    return [
+            (microbatch_size1, microbatch_size2, 3, 4),         # 3.1 Full tensor (i.e. full expected shape)
             (microbatch_size1, microbatch_size2, 45, 17),        # 3.1 Full tensor (i.e. full expected shape)
             (microbatch_size1, microbatch_size2, 1, 23),        # 3.2 Tensor reduce on one or more dims to 1
             (microbatch_size1, microbatch_size2, 64, 1),        # 3.2 Tensor reduce on one or more dims to 1
@@ -265,7 +268,7 @@ def get_input_shapes(microbatch_size1=1, microbatch_size2=1):
 
 # test matmul in all cases according to test plan
 @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_TEST_PLAN_PATH) if "model" in item])
-@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size1=1))
+@pytest.mark.parametrize("input_shape", get_input_shapes())
 def test_matmul_according_to_test_plan(
     model,
     input_shape,
@@ -273,6 +276,8 @@ def test_matmul_according_to_test_plan(
     input_params=[], 
     math_fidelity=None
 ):
+    if(model == "model_op_src_const_inputs2" and math_fidelity == None):
+        pytest.skip() # this model has its own test: test_matmul_dram_prologued
 
     #BUG: when input shape is (1, 1, 10000, 1) - extreme ratios between height/width; it works for input shape when one dimension is 9920 or less, everything above(like 10000) throws error
     if (input_shape == (1, 1, 10000, 1) or input_shape == (1, 10000, 1)) and model in (
@@ -326,10 +331,14 @@ def test_matmul_according_to_test_plan(
 
     match model:
         case "model_op_src_from_dram1":
+            input_shape = (1,) + input_shape[1:]
             architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})'
         case "model_op_src_const_inputs1": 
+            input_shape = (1,) + input_shape[1:]
+            tr = (1,) + tr[1:]
             architecture = f'test_plan.{model}.BudaMatmulTest({input_shape}, {tr})'
         case "model_op_src_const_inputs2":
+            input_shape = (1,) + input_shape[1:]
             architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})'
         case _:
             architecture = f'test_plan.{model}.BudaMatmulTest()'
@@ -358,6 +367,111 @@ def test_matmul_according_to_test_plan(
         input_params=[input_params],
     )
 
+    file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename
+    match model:
+        case "model_op_src_from_dram2":
+            assert netlist_utils.read_netlist_value(file_path, "/queues/x1/loc") == 'dram'
+            assert netlist_utils.read_netlist_value(file_path, "/queues/x2/loc") == 'dram'
+        case "model_op_src_const_inputs1":
+            d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0")
+            for key in d.keys():
+                assert "Matmul" not in key
+
+
+
+def get_input_shapes_prologued():
+                                              # Here we cover interesting combinations of input shapes:
+    return [
+            ((2, 3, 4),         True, False),  #0        # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 3, 4),         False, True),  #1        # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 3, 4),         None, True),   #2        # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 3, 4),         True, False),  #3        # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 3, 4),         False, True),  #4        # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 3, 4),         None, True),   #5        # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs
+            ((2, 45, 17),       None, True),   #6        # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 1, 23),        None, True),   #7        # 3.2 Tensor reduce on one or more dims to 1
+            ((2, 64, 1),        None, True),   #8        # 3.2 Tensor reduce on one or more dims to 1
+            ((2, 100, 100),     None, True),   #9        # 4.3 Very large (thousands, 10s of thousands)
+            ((2, 1000, 100),    None, True),   #10       # 4.3 Very large (thousands, 10s of thousands)
+            ((2, 10, 1000),     None, True),   #11       # 4.4 Extreme ratios between height/width
+            ((2, 9920, 1),      None, True),   #12       # 4.4 Extreme ratios between height/width
+            ((2, 10000, 1),     None, False),  #13       # 4.4 Extreme ratios between height/width
+            ((2, 32, 64),       None, True),   #14       # 4.1 Divisible by 32
+            ((2, 160, 96),      None, True),   #15       # 4.1 Divisible by 32
+            ((2, 17, 41),       None, True),   #16       # 4.2 Prime numbers
+            ((2, 89, 3),        None, True),   #17       # 4.2 Prime numbers
+
+            ((2, 1, 3, 4),      True, False),  #18       # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 1, 3, 4),      False, True),  #19       # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 1, 3, 4),      None, True) ,  #20       # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 1, 3, 4),      True, False),  #21       # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 1, 3, 4),      False, True),  #22       # 3.1 Full tensor (i.e. full expected shape)
+            ((1, 1, 3, 4),      None, True),   #23       # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs
+            ((2, 1, 45, 17),    None, True) ,  #24       # 3.1 Full tensor (i.e. full expected shape)
+            ((2, 1, 1, 23),     None, True) ,  #25       # 3.2 Tensor reduce on one or more dims to 1
+            ((2, 1, 64, 1),     None, True) ,  #26       # 3.2 Tensor reduce on one or more dims to 1
+            ((2, 1, 100, 100),  None, True) ,  #27       # 4.3 Very large (thousands, 10s of thousands)
+            ((2, 1, 1000, 100), None, True) ,  #28       # 4.3 Very large (thousands, 10s of thousands)
+            ((2, 1, 10, 1000),  None, True) ,  #29       # 4.4 Extreme ratios between height/width
+            ((2, 1, 9920, 1),   None, True) ,  #30       # 4.4 Extreme ratios between height/width 
+            ((2, 1, 10000, 1),  None, True) ,  #31       # 4.4 Extreme ratios between height/width   
+            ((2, 1, 32, 64),    None, True) ,  #32       # 4.1 Divisible by 32
+            ((2, 1, 160, 96),   None, True) ,  #33       # 4.1 Divisible by 32
+            ((2, 1, 17, 41),    None, True) ,  #34       # 4.2 Prime numbers
+            ((2, 1, 89, 3),     None, True) ,  #35       # 4.2 Prime numbers
+            ]
+
+@pytest.mark.parametrize("input_shape, default_dram_params, prologue", get_input_shapes_prologued())
+def test_matmul_dram_prologued(
+    input_shape,
+    default_dram_params,
+    prologue,
+    test_device,
+):
+    model = "model_op_src_const_inputs2"
+    #BUG: input shapes (2, 1, 10000, 1) and (2, 10000, 1) (extreme height/width ratios) fail; shapes where that dimension is 9920 or less work, anything above (like 10000) throws an error
+    if (input_shape == (2, 1, 10000, 1) or input_shape == (2, 10000, 1)) and model == "model_op_src_const_inputs2":
+        pytest.xfail("Error for input shape (1, 1, 10000, 1). Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245:")
+   
+    # generate input shapes
+    opernad_num = 0
+    tr_operand_num = 1
+    if(len(input_shape) == 3):
+        tr = (input_shape[0],input_shape[2],input_shape[1])
+    else:
+        tr = (input_shape[0],input_shape[1],input_shape[3],input_shape[2])
+    input_shapes = list([input_shape for _ in range(opernad_num)])
+    for _ in range(tr_operand_num):
+        input_shapes.append(tr) 
+    input_shapes = tuple(input_shapes)
+
+    input_shape = (1,) + input_shape[1:]
+
+    architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})'
+    model_eval = eval(architecture)
+
+    # set compiler config file
+    compiler_cfg = _get_global_compiler_config()
+    compiler_cfg.enable_training = False
+    compiler_cfg.input_queues_on_host = False
+    compiler_cfg.default_dram_parameters = default_dram_params
+    
+    verify_module(
+        model_eval,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+    )
+
+    file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename
+    d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_mm1")
+    if prologue:
+        assert d['prologue']
+    else:
+        assert not d['prologue']
 
 
 def get_input_shape(microbatch_size1=1, microbatch_size2=1):

From f1b556346ab755da33e79c0253a4bd91029f6daf Mon Sep 17 00:00:00 2001
From: dgolubovic <dgolubovic@tenstorrent.com>
Date: Thu, 13 Jun 2024 10:22:20 +0000
Subject: [PATCH 12/29] [padding] add queue instead of padding if adding only
 queue fixes all failures.

(cherry picked from commit 12352f59ab050fac86563829d79aea5f3ea7f6ef)
---
 pybuda/csrc/balancer/exceptions.hpp        |  9 +++
 pybuda/csrc/passes/padding_pass_placer.cpp | 84 +++++++++-------------
 2 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/pybuda/csrc/balancer/exceptions.hpp b/pybuda/csrc/balancer/exceptions.hpp
index 755d8959..6308062a 100644
--- a/pybuda/csrc/balancer/exceptions.hpp
+++ b/pybuda/csrc/balancer/exceptions.hpp
@@ -57,6 +57,15 @@ class BudaOpNodeLegalizerFailureInfo
         return opModelFailureCountByType[failureReason];
     }
 
+    // Returns the total number of failures targeted by padding. Padding aims to resolve these failures.
+    std::uint32_t getFailuresCountTargetedByPadding() const
+    {
+        return opModelFailureCountByType[UserAccessPreventsStreaming] +
+               opModelFailureCountByType[OperandAccessPreventsStreaming] +
+               opModelFailureCountByType[OperandAndUserAccessPreventsStreaming] +
+               opModelFailureCountByType[InputBufferAllocationFailure];
+    }
+
     std::string toString() const
     {
         std::string result = "Op model failure counts by type: \n";
diff --git a/pybuda/csrc/passes/padding_pass_placer.cpp b/pybuda/csrc/passes/padding_pass_placer.cpp
index 2f711b20..310350ff 100644
--- a/pybuda/csrc/passes/padding_pass_placer.cpp
+++ b/pybuda/csrc/passes/padding_pass_placer.cpp
@@ -143,8 +143,7 @@ bool run_padding_loop(
     Padding &padding,
     const BudaOpNodeLegalizerFailureInfo &failure_info,
     const BalancerConfig &balancer_config,
-    std::shared_ptr<balancer::BalancerCacheCollection> balancer_cache_collection,
-    bool queue_fixes_failures)
+    std::shared_ptr<balancer::BalancerCacheCollection> balancer_cache_collection)
 {
     const int PADDING_TRY_MAX = 10;
     bool padded_node = false;
@@ -225,20 +224,6 @@ bool run_padding_loop(
                     padding.pad_rhs_ct);
                 // If we added queue and also padded the node, we want to check if only adding the queue had solved
                 // the failures (queue_fixes_failures). If it did, we remove padding and keep the queue.
-                bool has_padding_queue = node_has_padding_queue(graph, node);
-                if (has_padding_queue && queue_fixes_failures)
-                {
-                    log_debug(
-                        LogPadding,
-                        "Node {} has padding queue on output edge but it's also padded. In this case only queue is "
-                        "enough.",
-                        node->name());
-                    remove_padding(graph, node, padding);
-                    insert_queue(graph, node);
-                    std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
-                        check_node_legality(graph, node, balancer_config, balancer_cache_collection);
-                    TT_ASSERT(failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
-                }
                 padded_node |= padded_loop;
                 break;
             }
@@ -247,41 +232,28 @@ bool run_padding_loop(
 
     if (!no_failures)
     {
-        // if we have failures after padding loop, we still have few things that can make the node legal.
-        // If queue_fixes_failures is true, this means that adding queue without padding the node fixes the failures.
-        if (queue_fixes_failures)
+        // If we still have failures but buffer_alloc_cnt == 0, we can try to handle the remaining failures by adding
+        // a queue on the output edge. In some cases this, along with padding, makes the node legal. However, if
+        // buffer_alloc_cnt is still nonzero, we remove the padding and give up.
+        if (buffer_alloc_cnt > 0)
         {
+            // After padding loop we still have input buffer allocation count issues.
+            log_warning(
+                LogPadding,
+                "Couldn't find padding for node: {} after {} iterations.",
+                node->name(),
+                padding_try_it);
+            // We remove padding only if it didn't solve input buffer allocation issues.
             remove_padding(graph, node, padding);
-            insert_queue(graph, node);
-            std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
-                check_node_legality(graph, node, balancer_config, balancer_cache_collection);
-            TT_ASSERT(failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
+            // unsuccessful padding for node
+            return false;
         }
-        else
-        {
-            // If we have failures, but  buffer_alloc_cnt == 0, then we can try to handle other failures by adding queue
-            // on output edge. In some cases this, along with padding will make node legal. However, if we don't even
-            // have buffer_alloc_cnt == 0, we remove padding and give up.
-            if (buffer_alloc_cnt > 0)
-            {
-                // After padding loop we still have input buffer allocation count issues.
-                log_warning(
-                    LogPadding,
-                    "Couldn't find padding for node: {} after {} iterations.",
-                    node->name(),
-                    padding_try_it);
-                // We remove padding only if it didn't solve input buffer allocation issues.
-                remove_padding(graph, node, padding);
-                // unsuccessful padding for node
-                return false;
-            }
 
-            if (user_access_cnt > 0 || operand_and_user_access_cnt > 0)
-            {
-                // Inserting queue helps with user access failures but can also solve input buffer allocation issues.
-                insert_queue(graph, node);
-                padded_node = true;
-            }
+        if (user_access_cnt > 0 || operand_and_user_access_cnt > 0)
+        {
+            // Inserting queue helps with user access failures but can also solve input buffer allocation issues.
+            insert_queue(graph, node);
+            padded_node = true;
         }
     }
     return padded_node;
@@ -333,10 +305,24 @@ bool pad_pass_placer(
         bool queue_fixes_failures =
             check_if_queue_fixes_failures(graph, node, balancer_config, balancer_cache_collection);
         log_trace(LogPadding, "For node {}, queue after node fixes failures: {}", node->name(), queue_fixes_failures);
+        if (queue_fixes_failures && failure_info.getFailuresCountTargetedByPadding())
+        {
+            // add queue instead of padding if there are failures related to padding and queue fixes them.
+            log_debug(
+                LogPadding,
+                "Node {} becomes legal after adding queue. Adding queue instead of padding.",
+                node->name());
+            insert_queue(graph, node);
+            std::unordered_map<Node *, const BudaOpNodeLegalizerFailureInfo> failures =
+                check_node_legality(graph, node, balancer_config, balancer_cache_collection);
+            TT_ASSERT(failures.size() == 0, "Adding queue is expected to fix all failures in this situation");
+            node->as<TaggedNode>()->add_tags({ { "padding", true } });
+            continue;
+        }
 
         // Try padding the node with nop on output edges.
         bool padded_node = run_padding_loop(
-            graph, node, padding, failure_info, balancer_config, balancer_cache_collection, queue_fixes_failures);
+            graph, node, padding, failure_info, balancer_config, balancer_cache_collection);
 
         if (!padded_node)
         {
@@ -347,7 +333,7 @@ bool pad_pass_placer(
             padding.orig_shape = orig_shape;
             padding.add_queues_on_output = true;
             padded_node = run_padding_loop(
-                graph, node, padding, failure_info, balancer_config, balancer_cache_collection, queue_fixes_failures);
+                graph, node, padding, failure_info, balancer_config, balancer_cache_collection);
         }
 
         // padded will be true if we padded successfully at least one node from the nodes_to_pad.

From 93aeccdf5bb567b44629be8cd6c718e330227907 Mon Sep 17 00:00:00 2001
From: Vladimir Canic <vcanic@tenstorrent.com>
Date: Sun, 16 Jun 2024 07:09:12 +0000
Subject: [PATCH 13/29] Switch off NOC bandwidth estimates for T5 and flan-T5
 benchmark models.

(cherry picked from commit 9fd065c70bdaed2900dc3d3e0f3d74754bb383b3)
---
 pybuda/test/benchmark/benchmark/models/t5.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pybuda/test/benchmark/benchmark/models/t5.py b/pybuda/test/benchmark/benchmark/models/t5.py
index db21a570..81d57d87 100644
--- a/pybuda/test/benchmark/benchmark/models/t5.py
+++ b/pybuda/test/benchmark/benchmark/models/t5.py
@@ -26,6 +26,10 @@ def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, da
         # Disable DRAM BW estimates.
         os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
 
+    if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
+
     # These are about to be enabled by default.
     #
     os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
@@ -71,6 +75,10 @@ def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: st
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
 
+    if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
+
     # These are about to be enabled by default.
     #
     os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"

From 9d4a2b93722e5483819eac5963bf75af761ee2b4 Mon Sep 17 00:00:00 2001
From: Ashok Kumar Kannan <akannan@tenstorrent.com>
Date: Mon, 17 Jun 2024 11:52:27 +0000
Subject: [PATCH 14/29] Add monodle model demos for PyTorch in Wormhole &
 Grayskull

(cherry picked from commit 711927afad3e6d94d3c8cccf61344e03f475a0f1)
---
 pybuda/pybuda/tvm_to_python.py                |   8 +-
 .../high_prio/cnn/pytorch/test_monodle.py     |  62 ++
 pybuda/test/model_demos/models/monodle.py     | 541 ++++++++++++++++++
 3 files changed, 610 insertions(+), 1 deletion(-)
 create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py
 create mode 100644 pybuda/test/model_demos/models/monodle.py

diff --git a/pybuda/pybuda/tvm_to_python.py b/pybuda/pybuda/tvm_to_python.py
index 91b47921..e686013d 100644
--- a/pybuda/pybuda/tvm_to_python.py
+++ b/pybuda/pybuda/tvm_to_python.py
@@ -632,7 +632,13 @@ def populate_conv2d_transpose_args(graph, nid, compiler_cfg):
     assert all([x == 1 for x in dilation]), "Only supports dilation of 1"
     args.append(("dilation", f"{dilation[0]}",))
 
-    in_channel = next((n['attrs']['shape'][0][0][0] for n in graph['nodes'] if n['name'] == 'model.weight'), None)
+    in_channel = None
+    for input_ in node["inputs"]:
+        input_nid = input_[0]
+        input_node = graph["nodes"][input_nid]
+        if input_node["op"] == "parameter" and input_node["name"].endswith(".weight"):
+            in_channel = input_node["attrs"]["shape"][0][0][0]
+            break
     groups = int(node["attrs"]["groups"][0][0])
     assert groups == 1 or (in_channel is not None and groups == in_channel), "Only supports group of 1 or in_channel"
     args.append(("groups", f"{groups}",))
diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py
new file mode 100644
index 00000000..9b0a2958
--- /dev/null
+++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py
@@ -0,0 +1,62 @@
+import pybuda
+from pybuda.verify.backend import verify_module
+from pybuda import VerifyConfig
+from pybuda.verify import TestKind
+import os
+import requests
+import torchvision.transforms as transforms
+from PIL import Image
+from test.model_demos.models.monodle import CenterNet3D
+
+
+def test_monodle_pytorch(test_device):
+    # PyBuda configuration parameters
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.balancer_policy = "Ribbon"
+    compiler_cfg.enable_auto_fusing = False
+    compiler_cfg.default_df_override = pybuda._C.Float16_b
+    os.environ["PYBUDA_RIBBON2"] = "1"
+
+    pcc = 0.99
+    if test_device.arch == pybuda.BackendDevice.Wormhole_B0:
+        os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1"
+        os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{88*1024}"
+        pcc = 0.98
+    elif test_device.arch == pybuda.BackendDevice.Grayskull:
+        os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1"
+        pcc = 0.93
+
+    model_name = "monodle_pytorch"
+
+    # Load data sample
+    url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg"
+    image = Image.open(requests.get(url, stream=True).raw)
+
+    # Preprocessing
+    transform = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        ]
+    )
+    img_tensor = transform(image).unsqueeze(0)
+
+    pytorch_model = CenterNet3D(backbone="dla34")
+    pytorch_model.eval()
+
+    tt_model = pybuda.PyTorchModule(model_name, pytorch_model)
+
+    verify_module(
+        tt_model,
+        input_shapes=[img_tensor.shape],
+        inputs=[(img_tensor,)],
+        verify_cfg=VerifyConfig(
+            arch=test_device.arch,
+            devtype=test_device.devtype,
+            devmode=test_device.devmode,
+            test_kind=TestKind.INFERENCE,
+            pcc=pcc,
+        ),
+    )
diff --git a/pybuda/test/model_demos/models/monodle.py b/pybuda/test/model_demos/models/monodle.py
new file mode 100644
index 00000000..39c9c741
--- /dev/null
+++ b/pybuda/test/model_demos/models/monodle.py
@@ -0,0 +1,541 @@
+"""
+Code adapted from https://github.com/xinzhuma/monodle.git
+
+MIT License
+
+Copyright (c) 2021 XINZHU.MA
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+
+import torch
+import os
+import torch.nn as nn
+import math
+import torch.utils.model_zoo as model_zoo
+import numpy as np
+from collections import OrderedDict
+
+
+def get_model_url(data="imagenet", name="dla34", hash="ba72cf86"):
+    """Build the DLA checkpoint URL ("/"-joined; os.path.join is OS-dependent)."""
+    base_url = "http://dl.yf.io/dla/models"
+    return "/".join([base_url, data, "{}-{}.pth".format(name, hash)])
+
+
+BatchNorm = nn.BatchNorm2d
+
+
+class Conv2d(nn.Module):
+    def __init__(self, in_planes, out_planes, kernal_szie=3, stride=1, bias=True):
+
+        super(Conv2d, self).__init__()
+        self.conv = nn.Conv2d(
+            in_planes,
+            out_planes,
+            kernel_size=kernal_szie,
+            stride=stride,
+            padding=kernal_szie // 2,
+            bias=bias,
+        )
+        self.bn = BatchNorm(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+# weight init for up-sample layers [transposed conv2d]
+def fill_up_weights(up):
+    w = up.weight.data
+    f = math.ceil(w.size(2) / 2)
+    c = (2 * f - 1 - f % 2) / (2.0 * f)
+    for i in range(w.size(2)):
+        for j in range(w.size(3)):
+            w[0, 0, i, j] = (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
+    for c in range(1, w.size(0)):
+        w[c, 0, :, :] = w[0, 0, :, :]
+
+
+class IDAUp(nn.Module):
+    """
+    input: feature maps of different layers
+    output: up-sampled features
+    """
+
+    def __init__(self, in_channels_list, up_factors_list, out_channels):
+
+        super(IDAUp, self).__init__()
+        self.in_channels_list = in_channels_list
+        self.out_channels = out_channels
+
+        for i in range(1, len(in_channels_list)):
+            in_channels = in_channels_list[i]
+            up_factors = int(up_factors_list[i])
+
+            proj = Conv2d(
+                in_channels, out_channels, kernal_szie=3, stride=1, bias=False
+            )
+            node = Conv2d(
+                out_channels * 2, out_channels, kernal_szie=3, stride=1, bias=False
+            )
+            up = nn.ConvTranspose2d(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                kernel_size=up_factors * 2,
+                stride=up_factors,
+                padding=up_factors // 2,
+                output_padding=0,
+                groups=out_channels,
+                bias=False,
+            )
+            fill_up_weights(up)
+
+            setattr(self, "proj_" + str(i), proj)
+            setattr(self, "up_" + str(i), up)
+            setattr(self, "node_" + str(i), node)
+
+        # weight init
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2.0 / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, layers):
+
+        assert len(self.in_channels_list) == len(layers), "{} vs {} layers".format(
+            len(self.in_channels_list), len(layers)
+        )
+
+        for i in range(1, len(layers)):
+            upsample = getattr(self, "up_" + str(i))
+            project = getattr(self, "proj_" + str(i))
+            node = getattr(self, "node_" + str(i))
+
+            layers[i] = upsample(project(layers[i]))
+            layers[i] = node(torch.cat([layers[i - 1], layers[i]], 1))
+
+        return layers
+
+
+class DLAUp(nn.Module):
+    def __init__(self, in_channels_list, scales_list=(1, 2, 4, 8, 16)):
+        """Register len(in_channels_list) - 1 IDAUp stages as attributes "ida_<i>"."""
+        super(DLAUp, self).__init__()
+        scales_list = np.array(scales_list, dtype=int)
+        in_channels_list = list(in_channels_list)  # private copy: rewritten in place below
+        for i in range(len(in_channels_list) - 1):
+            j = -i - 2
+            setattr(
+                self,
+                "ida_{}".format(i),
+                IDAUp(
+                    in_channels_list=in_channels_list[j:],
+                    up_factors_list=scales_list[j:] // scales_list[j],
+                    out_channels=in_channels_list[j],
+                ),
+            )
+            scales_list[j + 1 :] = scales_list[j]
+            in_channels_list[j + 1 :] = [
+                in_channels_list[j] for _ in in_channels_list[j + 1 :]
+            ]
+
+    def forward(self, layers):
+        """Apply stage i to the last i + 2 entries of `layers`; return the final entry."""
+        layers = list(layers)
+        assert len(layers) > 1
+        for i in range(len(layers) - 1):
+            ida = getattr(self, "ida_{}".format(i))
+            layers[-i - 2 :] = ida(layers[-i - 2 :])
+        return layers[-1]
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+
+        super(BasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(
+            inplanes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=dilation,
+            bias=False,
+            dilation=dilation,
+        )
+        self.bn1 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(
+            planes,
+            planes,
+            kernel_size=3,
+            stride=1,
+            padding=dilation,
+            bias=False,
+            dilation=dilation,
+        )
+        self.bn2 = BatchNorm(planes)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Root(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+
+        super(Root, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            1,
+            stride=1,
+            bias=False,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.bn = BatchNorm(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+
+        children = x
+        x = self.conv(torch.cat(x, 1))
+        x = self.bn(x)
+        if self.residual:
+            x += children[0]
+        x = self.relu(x)
+
+        return x
+
+
+class Tree(nn.Module):
+    def __init__(
+        self,
+        levels,
+        block,
+        in_channels,
+        out_channels,
+        stride=1,
+        level_root=False,
+        root_dim=0,
+        root_kernel_size=1,
+        dilation=1,
+        root_residual=False,
+    ):
+
+        super(Tree, self).__init__()
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride, dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1, dilation=dilation)
+        else:
+            self.tree1 = Tree(
+                levels - 1,
+                block,
+                in_channels,
+                out_channels,
+                stride,
+                root_dim=0,
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                root_residual=root_residual,
+            )
+            self.tree2 = Tree(
+                levels - 1,
+                block,
+                out_channels,
+                out_channels,
+                root_dim=root_dim + out_channels,
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                root_residual=root_residual,
+            )
+        if levels == 1:
+            self.root = Root(root_dim, out_channels, root_kernel_size, root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, bias=False
+                ),
+                BatchNorm(out_channels),
+            )
+
+    def forward(self, x, residual=None, children=None):
+
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            x = self.root(x2, x1, *children)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+class DLA(nn.Module):
+    def __init__(
+        self,
+        levels,
+        channels,
+        num_classes=1000,
+        block=BasicBlock,
+        residual_root=False,
+        return_levels=False,
+        pool_size=7,
+        linear_root=False,
+    ):
+
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.return_levels = return_levels
+        self.num_classes = num_classes
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3, bias=False),
+            BatchNorm(channels[0]),
+            nn.ReLU(inplace=True),
+        )
+        self.level0 = self._make_conv_level(channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2
+        )
+        self.level2 = Tree(
+            levels[2],
+            block,
+            channels[1],
+            channels[2],
+            2,
+            level_root=False,
+            root_residual=residual_root,
+        )
+        self.level3 = Tree(
+            levels[3],
+            block,
+            channels[2],
+            channels[3],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+        self.level4 = Tree(
+            levels[4],
+            block,
+            channels[3],
+            channels[4],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+        self.level5 = Tree(
+            levels[5],
+            block,
+            channels[4],
+            channels[5],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+
+        self.avgpool = nn.AvgPool2d(pool_size)
+        self.fc = nn.Conv2d(
+            channels[-1], num_classes, kernel_size=1, stride=1, padding=0, bias=True
+        )
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2.0 / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
+        modules = []
+        for i in range(convs):
+            modules.extend(
+                [
+                    nn.Conv2d(
+                        inplanes,
+                        planes,
+                        kernel_size=3,
+                        stride=stride if i == 0 else 1,
+                        padding=dilation,
+                        bias=False,
+                        dilation=dilation,
+                    ),
+                    BatchNorm(planes),
+                    nn.ReLU(inplace=True),
+                ]
+            )
+            inplanes = planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+
+        y = []
+        x = self.base_layer(x)
+        for i in range(6):
+            x = getattr(self, "level{}".format(i))(x)
+            y.append(x)
+        if self.return_levels:
+            return y
+        else:
+            x = self.avgpool(x)
+            x = self.fc(x)
+            x = x.view(x.size(0), -1)
+
+            return x
+
+    def load_pretrained_model(self, data="imagenet", name="dla34", hash="ba72cf86"):
+        fc = self.fc
+        if name.endswith(".pth"):
+            model_weights = torch.load(data + name)
+        else:
+            model_url = get_model_url(data, name, hash)
+            model_weights = model_zoo.load_url(model_url)
+        num_classes = len(model_weights[list(model_weights.keys())[-1]])
+        self.fc = nn.Conv2d(
+            self.channels[-1],
+            num_classes,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.load_state_dict(model_weights, strict=False)
+        self.fc = fc
+
+
+def dla34(pretrained=False, **kwargs):  # DLA-34
+
+    model = DLA(
+        [1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512], block=BasicBlock, **kwargs
+    )
+    if pretrained:
+        model.load_pretrained_model(data="imagenet", name="dla34", hash="ba72cf86")
+    return model
+
+
+class CenterNet3D(nn.Module):
+    def __init__(self, backbone="dla34", neck="DLAUp", num_class=3, downsample=4):
+        """
+        CenterNet for monocular 3D object detection.
+        :param backbone: the backbone of pipeline, such as dla34.
+        :param neck: the necks of detection, such as dla_up.
+        :param downsample: the ratio of down sample. [4, 8, 16, 32]
+        :param num_class: output channels of the heatmap head. default: 3 (head conv width is fixed at 256)
+        """
+
+        assert downsample in [4, 8, 16, 32]
+        super().__init__()
+
+        self.heads = {
+            "heatmap": num_class,
+            "offset_2d": 2,
+            "size_2d": 2,
+            "depth": 2,
+            "offset_3d": 2,
+            "size_3d": 3,
+            "heading": 24,
+        }
+        self.backbone = dla34(pretrained=True, return_levels=True)
+        channels = (
+            self.backbone.channels
+        )  # channels list for feature maps generated by backbone
+        self.first_level = int(np.log2(downsample))
+        scales = [2**i for i in range(len(channels[self.first_level :]))]
+        self.neck = DLAUp(
+            channels[self.first_level :], scales_list=scales
+        )  # feature fusion [such as DLAup, FPN]
+
+        # initialize the head of pipeline, according to heads setting.
+        for head in self.heads.keys():
+            output_channels = self.heads[head]
+            fc = nn.Sequential(
+                nn.Conv2d(
+                    channels[self.first_level], 256, kernel_size=3, padding=1, bias=True
+                ),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(
+                    256, output_channels, kernel_size=1, stride=1, padding=0, bias=True
+                ),
+            )
+
+            # initialization
+            if "heatmap" in head:
+                fc[-1].bias.data.fill_(-2.19)
+            else:
+                self.fill_fc_weights(fc)
+
+            self.__setattr__(head, fc)
+
+    def forward(self, input):
+
+        feat = self.backbone(input)
+        feat = self.neck(feat[self.first_level :])
+
+        ret = OrderedDict()
+        for head in self.heads:
+            ret[head] = self.__getattr__(head)(feat)
+
+        return ret
+
+    def fill_fc_weights(self, layers):
+        for m in layers.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.normal_(m.weight, std=0.001)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)

From d9c8f7dd1eb9ca92777f32a992cc31e8a5efca9a Mon Sep 17 00:00:00 2001
From: Vladimir Canic <vcanic@tenstorrent.com>
Date: Tue, 18 Jun 2024 05:02:03 +0000
Subject: [PATCH 15/29] Disable data movement, DRAM and NOC, estimates by
 default.

(cherry picked from commit a15c8be20a843abed134cf458ef4dcc6bb67a0ce)
---
 pybuda/csrc/balancer/policies/policy_utils.cpp          | 4 ++--
 pybuda/test/benchmark/benchmark/models/deit.py          | 2 --
 pybuda/test/benchmark/benchmark/models/mobilenet_v2.py  | 3 +++
 pybuda/test/benchmark/benchmark/models/openpose_hand.py | 3 +++
 pybuda/test/benchmark/benchmark/models/resnet.py        | 3 +++
 pybuda/test/benchmark/benchmark/models/t5.py            | 8 --------
 pybuda/test/benchmark/benchmark/models/unet.py          | 2 --
 pybuda/test/benchmark/benchmark/models/vit.py           | 2 --
 8 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/pybuda/csrc/balancer/policies/policy_utils.cpp b/pybuda/csrc/balancer/policies/policy_utils.cpp
index cbd0fc68..fb781309 100644
--- a/pybuda/csrc/balancer/policies/policy_utils.cpp
+++ b/pybuda/csrc/balancer/policies/policy_utils.cpp
@@ -1880,10 +1880,10 @@ OpCycleEstimates get_op_cycles_estimates(
     static const bool disable_model_kb_prologue_bw = env_as<bool>("PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW", false);
 
     // Should we use estimates for the NOC bandwidth.
-    static const bool use_noc_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES", true);
+    static const bool use_noc_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES", false);
 
     // Should we use estimates for the DRAM bandwidth.
-    static const bool use_dram_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES", true);
+    static const bool use_dram_bw_estimates = env_as<bool>("PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES", false);
 
     const float inefficency_divider = 2.0;
     const float subchannel_oversub_coeff = 1.5;
diff --git a/pybuda/test/benchmark/benchmark/models/deit.py b/pybuda/test/benchmark/benchmark/models/deit.py
index ff195aa4..b0174267 100644
--- a/pybuda/test/benchmark/benchmark/models/deit.py
+++ b/pybuda/test/benchmark/benchmark/models/deit.py
@@ -25,8 +25,6 @@ def deit(training: bool, config: str, microbatch: int, devtype: str, arch: str,
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
-        # Disable NOC BW estimates.
-        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #
diff --git a/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py b/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py
index 62858c45..5911c124 100644
--- a/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py
+++ b/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py
@@ -24,6 +24,9 @@ def mobilenet_v2(training: bool, config: str, microbatch: int, devtype: str, arc
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Enable Data Movement Estimates
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "1"
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "1"
 
     os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1"
 
diff --git a/pybuda/test/benchmark/benchmark/models/openpose_hand.py b/pybuda/test/benchmark/benchmark/models/openpose_hand.py
index 52e5b28a..f5738821 100644
--- a/pybuda/test/benchmark/benchmark/models/openpose_hand.py
+++ b/pybuda/test/benchmark/benchmark/models/openpose_hand.py
@@ -28,6 +28,9 @@ def openpose_hand(training: bool, config: str, microbatch: int, devtype: str, ar
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Enable Data Movement Estimates
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "1"
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "1"
 
     os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1"
 
diff --git a/pybuda/test/benchmark/benchmark/models/resnet.py b/pybuda/test/benchmark/benchmark/models/resnet.py
index 1504153a..b280b79a 100644
--- a/pybuda/test/benchmark/benchmark/models/resnet.py
+++ b/pybuda/test/benchmark/benchmark/models/resnet.py
@@ -30,6 +30,9 @@ def resnet(training: bool, config: str, microbatch: int, devtype: str, arch: str
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
+        # Enable Data Movement Estimates
+        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "1"
+        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "1"
 
     # These are about to be enabled by default.
     #
diff --git a/pybuda/test/benchmark/benchmark/models/t5.py b/pybuda/test/benchmark/benchmark/models/t5.py
index 81d57d87..db21a570 100644
--- a/pybuda/test/benchmark/benchmark/models/t5.py
+++ b/pybuda/test/benchmark/benchmark/models/t5.py
@@ -26,10 +26,6 @@ def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, da
         # Disable DRAM BW estimates.
         os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
 
-    if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
-        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
-        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
-
     # These are about to be enabled by default.
     #
     os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
@@ -75,10 +71,6 @@ def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: st
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
 
-    if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
-        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
-        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
-
     # These are about to be enabled by default.
     #
     os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
diff --git a/pybuda/test/benchmark/benchmark/models/unet.py b/pybuda/test/benchmark/benchmark/models/unet.py
index 57f7c686..165673ff 100644
--- a/pybuda/test/benchmark/benchmark/models/unet.py
+++ b/pybuda/test/benchmark/benchmark/models/unet.py
@@ -24,8 +24,6 @@ def unet(training: bool, config: str, microbatch: int, devtype: str, arch: str,
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
-        # Disable NOC BW estimates.
-        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # Manually enable amp light for Ribbon
     if compiler_cfg.balancer_policy == "Ribbon":
diff --git a/pybuda/test/benchmark/benchmark/models/vit.py b/pybuda/test/benchmark/benchmark/models/vit.py
index 3962bda1..aa011328 100644
--- a/pybuda/test/benchmark/benchmark/models/vit.py
+++ b/pybuda/test/benchmark/benchmark/models/vit.py
@@ -24,8 +24,6 @@ def vit(training: bool, config: str, microbatch: int, devtype: str, arch: str, d
     if data_type == "Bfp8_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
-        # Disable NOC BW estimates.
-        os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #

From 4c82da5a66b8ccc3200877020fc333fea1212734 Mon Sep 17 00:00:00 2001
From: Nikola Obradovic <nobradovic@tenstorrent.com>
Date: Sat, 15 Jun 2024 08:30:07 +0000
Subject: [PATCH 16/29] [Balancer/GS] Fix partial datacopy ops related OpModel
 choice mismatch. [perf_wh]

(cherry picked from commit 0cdf10fd5741defa99be8fda8451f29435a6b4c1)
---
 .../csrc/balancer/legalizer/graph_solver.cpp  | 158 +++++++++++++++++-
 .../csrc/balancer/legalizer/graph_solver.hpp  |   8 +
 pybuda/csrc/balancer/legalizer/legalizer.cpp  |  23 +--
 .../csrc/balancer/policies/policy_ribbon2.cpp |  21 ---
 pybuda/csrc/graph_lib/graph.cpp               |   8 +
 pybuda/csrc/graph_lib/graph.hpp               |   4 +
 pybuda/csrc/graph_lib/utils.cpp               |   3 +-
 pybuda/csrc/passes/dataformat.cpp             |  10 +-
 pybuda/csrc/passes/insert_inverse_on_io.cpp   |   8 +-
 .../lower_concat_to_runtime_transform.cpp     |   5 +-
 pybuda/csrc/passes/pad_output_buffer.cpp      |   5 +-
 pybuda/csrc/passes/pre_placer_buda_passes.cpp |  17 +-
 .../tests/test_past_cache_ublock_order.cpp    |   5 +-
 pybuda/test/benchmark/benchmark/models/t5.py  |   2 -
 14 files changed, 193 insertions(+), 84 deletions(-)

diff --git a/pybuda/csrc/balancer/legalizer/graph_solver.cpp b/pybuda/csrc/balancer/legalizer/graph_solver.cpp
index 9e14f4c3..37a69b10 100644
--- a/pybuda/csrc/balancer/legalizer/graph_solver.cpp
+++ b/pybuda/csrc/balancer/legalizer/graph_solver.cpp
@@ -1176,6 +1176,13 @@ void GraphSolver::set(graphlib::Node const* node, OpModel const& op_model, bool
     }
 
     update_solver(node, true /*expand_root*/, true /*invoked_by_set*/);
+
+    if (shared_data->sibling_operands_of_partial_datacopy_output.count(node) > 0)
+    {
+        // Enforce similar op models for sibling operands of partial datacopy output.
+        //
+        op_model_sync_with_sibling_partial_datacopy_ops(node, op_model);
+    }
 }
 
 // Given current epoch ops, runs the overlay model to determine the amount of memory used for each of the ops. Where
@@ -1563,7 +1570,12 @@ std::vector<graphlib::Node*> GraphSolver::buffer(std::vector<BufferInfo>& buffer
                 if (buffer_nop == nullptr)
                 {
                     std::tie(buffer_nop, std::ignore, std::ignore) = graphlib::insert_nop_on_edge(
-                        graph, e, op_name(src, original_dest, buffer_index), true /* is_buffering_op */, buff_info.hoist_tms, false /* remove_edge */);
+                        graph,
+                        e,
+                        op_name(src, original_dest, buffer_index),
+                        true /* is_buffering_op */,
+                        buff_info.hoist_tms,
+                        false /* remove_edge */);
 
                     register_virtual_node(buffer_nop);
                     nodes_to_legalize.insert(buffer_nop);
@@ -1574,7 +1586,14 @@ std::vector<graphlib::Node*> GraphSolver::buffer(std::vector<BufferInfo>& buffer
                     // Reuse the already created buffer nop for all edges between src and dest.
                     // Covers the case when source node is connected with multiple edges to the destination node.
                     // In that case we don't want to create multiple nops, but instead we reuse the same one.
-                    std::tie(std::ignore, std::ignore) = graphlib::insert_node_on_edge(graph, e, buffer_nop, false /* inherit_consumer_attrs */, false /* remove_edge */, 0, not buff_info.hoist_tms);
+                    std::tie(std::ignore, std::ignore) = graphlib::insert_node_on_edge(
+                        graph,
+                        e,
+                        buffer_nop,
+                        false /* inherit_consumer_attrs */,
+                        false /* remove_edge */,
+                        0,
+                        not buff_info.hoist_tms);
                 }
 
                 // Edge cannot be removed right away from the graph as we will affect global state
@@ -1667,9 +1686,14 @@ void GraphSolver::invalidate_streaming_into_output(const std::vector<graphlib::N
         //
         if (node->node_type() == graphlib::NodeType::kOutput)
         {
-            std::function<bool(tt::graphlib::Edge)> is_partial_datacopy_edge = [](Edge e)
-            { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
-            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
+
+            if (!partial_datacopy_edges.empty())
+            {
+                graphlib::Node const* datacopy_input = graph->users(node)[0];
+                TT_ASSERT(datacopy_input->node_type() == graphlib::NodeType::kInput);
+                partial_datacopy_edges = graph->operand_partial_datacopy_edges(datacopy_input);
+            }
 
             for (graphlib::Node* operand_node : graph->data_operands(node))
             {
@@ -1678,10 +1702,25 @@ void GraphSolver::invalidate_streaming_into_output(const std::vector<graphlib::N
                     bool no_stream_output_valid = false;
                     const graphlib::BudaOpNode* op_node = static_cast<const graphlib::BudaOpNode*>(operand_node);
 
+                    if (partial_datacopy_edges.size() > 1)
+                    {
+                        // Cache op writing into partial datacopy output node, for sibling opmodel sync.
+                        //
+                        shared_data->sibling_operands_of_partial_datacopy_output.insert(op_node);
+                    }
+
                     // Op model already selected for this node, skip.
                     //
                     if (selected_op_models.count(op_node) > 0)
                     {
+                        // Reapply filter for sibling datacopy op since this is coming from re-resolve.
+                        //
+                        if (partial_datacopy_edges.size() > 1)
+                        {
+                            apply_partial_datacopy_op_model_sync(
+                                partial_datacopy_edges, node, selected_op_models.at(op_node));
+                        }
+
                         continue;
                     }
 
@@ -1994,9 +2033,27 @@ void GraphSolver::invalidate_suboptimal_op_models_for_op(
 
 void GraphSolver::set_filter_grid_size(graphlib::Node const* node, OpModel const& role_op_model)
 {
+    // Op model already selected for this node, skip.
+    //
+    if (selected_op_models.count(node) > 0)
+    {
+        return;
+    }
+
     const std::vector<tt::balancer::OpModel>& op_models = get_legal_op_models(node);
 
+    if (op_models.size() == 1)
+    {
+        return;
+    }
+
     Bitset* node_bitset = get_bitset(node->id());
+
+    if (node_bitset->count() == 1)
+    {
+        return;
+    }
+
     Bitset temp_bitset = *node_bitset;
     std::uint32_t op_model_count = std::min(kNumBitsetBits, std::max(1lu, op_models.size()));
     Bitset discarded_op_models_bitset;
@@ -2039,6 +2096,97 @@ void GraphSolver::set_filter_grid_size(graphlib::Node const* node, OpModel const
     update_solver(node);
 }
 
+void GraphSolver::filter_op_models_for_sibling_partial_datacopy_op(
+    graphlib::Node const* node, OpModel const& role_op_model)
+{
+    // Keeps only the op models of node whose grid shape and block shape (mblock_m, mblock_n, ublock) match
+    // role_op_model. Op model already selected for this node, skip.
+    if (selected_op_models.count(node) > 0)
+    {
+        return;
+    }
+
+    const std::vector<tt::balancer::OpModel>& op_models = get_legal_op_models(node);
+
+    if (op_models.size() == 1)  // Single legal op model, nothing to filter out.
+    {
+        return;
+    }
+
+    Bitset* node_bitset = get_bitset(node->id());
+
+    if (node_bitset->count() == 1)  // Only one bit set for this node, nothing to filter out.
+    {
+        return;
+    }
+
+    Bitset temp_bitset = *node_bitset;
+    std::uint32_t op_model_count = std::min(kNumBitsetBits, std::max(1lu, op_models.size()));
+    Bitset discarded_op_models_bitset;  // Bit i set => op_models[i] does not match role_op_model.
+    for (size_t i = 0; i < op_model_count; i++)
+    {
+        if (op_models[i].grid_shape != role_op_model.grid_shape ||
+            op_models[i].block_shape().mblock_m != role_op_model.block_shape().mblock_m ||
+            op_models[i].block_shape().mblock_n != role_op_model.block_shape().mblock_n ||
+            op_models[i].block_shape().ublock != role_op_model.block_shape().ublock)
+        {
+            discarded_op_models_bitset.set(i);
+        }
+    }
+
+    if (discarded_op_models_bitset.none())  // Nothing was marked for discarding.
+    {
+        return;
+    }
+
+    temp_bitset &= ~discarded_op_models_bitset;
+
+    TT_ASSERT(temp_bitset.any());  // At least one op model must survive the filter.
+
+    *node_bitset = temp_bitset;
+
+    auto it = op_disabled_bitset_cache.find(node->id());  // Merge discarded bits into this node's cache entry.
+
+    if (it == op_disabled_bitset_cache.end())
+    {
+        op_disabled_bitset_cache.emplace(node->id(), discarded_op_models_bitset);
+    }
+    else
+    {
+        it->second |= discarded_op_models_bitset;
+    }
+
+    update_solver(node);
+}
+
+void GraphSolver::apply_partial_datacopy_op_model_sync(
+    std::vector<graphlib::Edge> const& partial_datacopy_edges,
+    graphlib::Node const* datacopy_output,
+    OpModel const& op_model)
+{
+    for (graphlib::Edge const& edge : partial_datacopy_edges)
+    {
+        graphlib::Node* producer = graph->node_by_id(edge.producer_node_id);
+        if (producer != datacopy_output)  // Only siblings of datacopy_output are filtered.
+        {
+            TT_ASSERT(producer->node_type() == graphlib::NodeType::kOutput);  // Siblings must be output nodes.
+            graphlib::Node const* datacopy_op_sibling = graph->data_operands(producer)[0];
+            filter_op_models_for_sibling_partial_datacopy_op(datacopy_op_sibling, op_model);
+        }
+    }
+}
+
+void GraphSolver::op_model_sync_with_sibling_partial_datacopy_ops(graphlib::Node const* node, OpModel const& op_model)
+{
+    graphlib::Node const* datacopy_output = graph->data_users(node)[0];
+    TT_ASSERT(datacopy_output->node_type() == graphlib::NodeType::kOutput);
+    graphlib::Node const* datacopy_input = graph->users(datacopy_output)[0];
+    TT_ASSERT(datacopy_input->node_type() == graphlib::NodeType::kInput);
+    std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_partial_datacopy_edges(datacopy_input);
+    // Filter the ops feeding every other output that has a partial datacopy edge into datacopy_input.
+    apply_partial_datacopy_op_model_sync(partial_datacopy_edges, datacopy_output, op_model);
+}
+
 #ifdef DEBUG
 // Computes and logs if there are valid connections for this edge among paths
 // that were discarded by previously computed edges(edge eliminated by disabling some OpModels).
diff --git a/pybuda/csrc/balancer/legalizer/graph_solver.hpp b/pybuda/csrc/balancer/legalizer/graph_solver.hpp
index 3f435ad9..dea5acda 100644
--- a/pybuda/csrc/balancer/legalizer/graph_solver.hpp
+++ b/pybuda/csrc/balancer/legalizer/graph_solver.hpp
@@ -390,12 +390,20 @@ class GraphSolver
     void invalidate_suboptimal_op_models_for_op(
         const graphlib::BudaOpNode* node, GraphSolverOpModelInvalidationStrategyTier tier);
 
+    void filter_op_models_for_sibling_partial_datacopy_op(graphlib::Node const* node, OpModel const& role_op_model);
+    void op_model_sync_with_sibling_partial_datacopy_ops(graphlib::Node const* node, OpModel const& op_model);
+    void apply_partial_datacopy_op_model_sync(
+        std::vector<graphlib::Edge> const& partial_datacopy_edges,
+        graphlib::Node const* datacopy_output,
+        OpModel const& op_model);
+
     struct SharedData
     {
        public:
         std::unique_ptr<Constraint> constraint;
         std::unordered_map<std::uint64_t, const std::pair<const EdgeCost, const ConstraintFailureReason>>
             constraint_result_cache;
+        std::unordered_set<const graphlib::Node*> sibling_operands_of_partial_datacopy_output;
 
        private:
         LegalOpModels legal_op_models;
diff --git a/pybuda/csrc/balancer/legalizer/legalizer.cpp b/pybuda/csrc/balancer/legalizer/legalizer.cpp
index 73288528..efcfd261 100644
--- a/pybuda/csrc/balancer/legalizer/legalizer.cpp
+++ b/pybuda/csrc/balancer/legalizer/legalizer.cpp
@@ -809,8 +809,7 @@ static std::tuple<int, bool, bool, bool> calculate_user_buffer_factor(
             can_stream_due_to_users = false;
         }
 
-        auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(user_node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(user_node);
         if (user_node->node_type() == graphlib::NodeType::kOutput and partial_datacopy_edges.empty())
         {
             // Host runtime outputs cannot support undoing z major order
@@ -2902,11 +2901,8 @@ static void resolve_input_queue_block_shapes(Graph const* graph, BalancerConfig
                 std::vector<OpModel const*> users;
                 std::vector<OpModel const*> prologue_users;
 
-                auto is_partial_datacopy_edge = [](Edge e)
-                { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
-                std::vector<graphlib::Edge> partial_datacopy_edges =
-                    graph->operand_edges(node, is_partial_datacopy_edge);
-                for (auto edge : user_edges)
+                std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_partial_datacopy_edges(node);
+                for (const graphlib::Edge& edge : user_edges)
                 {
                     graphlib::Node* user = graph->node_by_id(edge.consumer_node_id);
                     OpModel const& user_op_model = op_models.at(user->name());
@@ -2977,20 +2973,20 @@ static void resolve_input_queue_block_shapes(Graph const* graph, BalancerConfig
                         auto other_output = graph->node_by_id(edge.producer_node_id);
                         auto other_writeback_op = graph->data_operands(other_output).front();
                         OpModel const& other_op_model = op_models.at(other_writeback_op->name());
-                        TT_ASSERT(
+                        TT_LOG_ASSERT(
                             other_op_model.grid_shape == grid_shape,
                             "Partial datacopy grid shape mismatch on {} and {}",
                             writeback_op->name(),
-                            other_output->name());
+                            other_writeback_op->name());
                         bool block_shapes_match = other_op_model.block_shape().mblock_m == block_shape.mblock_m and
                                                   other_op_model.block_shape().mblock_n == block_shape.mblock_n and
                                                   other_op_model.block_shape().ublock == block_shape.ublock;
-                        TT_ASSERT(
+                        TT_LOG_ASSERT(
                             block_shapes_match,
                             "Partial datacopy block shape mismatch on (note, t's don't have to match)",
                             writeback_op->name(),
                             other_op_model.block_shape(),
-                            other_output->name(),
+                            other_writeback_op->name(),
                             block_shape);
                     }
 
@@ -3180,9 +3176,8 @@ std::tuple<OpModelMap, BlockShapeMap, OutputHostTMMap, CutEdges> resolve_block_s
     BlockShapeMap block_shape_map;
     for (Node* node : tt::graphlib::topological_sort(*graph))
     {
-        auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
         std::vector<graphlib::Edge> partial_datacopy_operand_edges =
-            graph->operand_edges(node, is_partial_datacopy_edge);
+            graph->operand_partial_datacopy_edges(node);
 
         BlockShape block_shape;
         switch (node->node_type())
@@ -3203,7 +3198,7 @@ std::tuple<OpModelMap, BlockShapeMap, OutputHostTMMap, CutEdges> resolve_block_s
                 GridShape operand_grid = operand_op_model.grid_shape;
 
                 block_shape = operand_block_shape;
-                std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+                std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
 
                 if (not operand_op_model.t_stream_factor.none())
                 {
diff --git a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp b/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
index e3c3edbf..dde6f038 100644
--- a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
+++ b/pybuda/csrc/balancer/policies/policy_ribbon2.cpp
@@ -92,19 +92,6 @@ std::optional<OpModel> get_closest_op_model_conservative(
     return closest_model;
 }
 
-bool operand_of_linked_output_node(const graphlib::Graph *graph, const graphlib::Node *node)
-{
-    for (const graphlib::Node *user_node : graph->data_users(node))
-    {
-        if (user_node->node_type() == graphlib::NodeType::kOutput and is_linked_queue(graph, user_node))
-        {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 // Optimize a solution by iteratively bumping up grids of the slowest ops, as long as that
 // improves the utilization of the epoch.
 // Conservative version which tries to stick to the same ribbon and same OP count in epoch.
@@ -174,14 +161,6 @@ EpochSolution optimize_solution_conservative(
         {
             const OpModel &source_op_model = best_solution.get_selected_op_models()[op_index];
 
-            // Mitigation for linked output nodes. We don't want to bump up the grid of the linked output node because
-            // of higher chance of op_model mismatch on OPs feeding the fake output.
-            //
-            if (operand_of_linked_output_node(graph, source_op_model.buda_op_node))
-            {
-                continue;
-            }
-
             int cycles = get_limiter_cycles(
                 source_op_model,
                 graph,
diff --git a/pybuda/csrc/graph_lib/graph.cpp b/pybuda/csrc/graph_lib/graph.cpp
index 125ade3c..33114290 100644
--- a/pybuda/csrc/graph_lib/graph.cpp
+++ b/pybuda/csrc/graph_lib/graph.cpp
@@ -228,6 +228,10 @@ std::vector<Edge> Graph::operand_data_edges(const Node *node, std::function<bool
     return operand_edges(node, [edge_filter](Edge edge) {return edge_filter(edge) and (edge.edge_type == EdgeType::kData or edge.edge_type == EdgeType::kDataLoopback);});
 }
 
+std::vector<Edge> Graph::operand_partial_datacopy_edges(const Node *node, std::function<bool(Edge)> edge_filter) const {
+    return operand_edges(node, [edge_filter](Edge edge) {return edge_filter(edge) and (edge.edge_type == graphlib::EdgeType::kPartialDataCopy);});
+}
+
 std::unordered_set<Edge> &Graph::user_edges_set(NodeId node_id) { return this->users_map_.at(node_id); }
 
 
@@ -280,6 +284,10 @@ std::vector<Edge> Graph::user_data_edges(const Node *node, std::function<bool(Ed
     return user_edges(node, [edge_filter](Edge edge) {return edge_filter(edge) and (edge.edge_type == EdgeType::kData or edge.edge_type == EdgeType::kDataLoopback);});
 }
 
+std::vector<Edge> Graph::user_partial_datacopy_edges(const Node *node, std::function<bool(Edge)> edge_filter) const {
+    return user_edges(node, [edge_filter](Edge edge) {return edge_filter(edge) and (edge.edge_type == graphlib::EdgeType::kPartialDataCopy);});
+}
+
 const std::unordered_set<Edge> &Graph::user_edges_set(const Node *node) const {
     return this->users_map_.at(node->id());
 }
diff --git a/pybuda/csrc/graph_lib/graph.hpp b/pybuda/csrc/graph_lib/graph.hpp
index fbcae519..bbcd6770 100644
--- a/pybuda/csrc/graph_lib/graph.hpp
+++ b/pybuda/csrc/graph_lib/graph.hpp
@@ -126,8 +126,12 @@ class Graph
         const Node *node, std::function<bool(Edge)> edge_filter = [](Edge) { return true; }) const;
     std::vector<Edge> operand_data_edges(
         const Node *node, std::function<bool(Edge)> edge_filter = [](Edge) { return true; }) const;
+    std::vector<Edge> operand_partial_datacopy_edges(
+        const Node *node, std::function<bool(Edge)> edge_filter = [](Edge) { return true; }) const;
     std::vector<Edge> user_data_edges(
         const Node *node, std::function<bool(Edge)> edge_filter = [](Edge) { return true; }) const;
+    std::vector<Edge> user_partial_datacopy_edges(
+        const Node *node, std::function<bool(Edge)> edge_filter = [](Edge) { return true; }) const;
 
     std::unordered_set<Edge> &user_edges_set(NodeId node_id);
     const std::unordered_set<Edge> &user_edges_set(const Node *node) const;
diff --git a/pybuda/csrc/graph_lib/utils.cpp b/pybuda/csrc/graph_lib/utils.cpp
index 954eb9ee..df0b8d05 100644
--- a/pybuda/csrc/graph_lib/utils.cpp
+++ b/pybuda/csrc/graph_lib/utils.cpp
@@ -1699,8 +1699,7 @@ std::vector<UBlockOrder> get_input_ublock_order(Graph const *graph, Node const *
 
 tt::graphlib::Node *get_input_queue_producer(Graph const *graph, tt::graphlib::InputNode const *node)
 {
-    auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); };
-    std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_edges(node, is_partial_datacopy_edge);
+    std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_partial_datacopy_edges(node);
     auto producers = graph->data_operands(node);
 
     if (not producers.empty() and not partial_datacopy_edges.empty())
diff --git a/pybuda/csrc/passes/dataformat.cpp b/pybuda/csrc/passes/dataformat.cpp
index b26898fa..dbdeb68d 100644
--- a/pybuda/csrc/passes/dataformat.cpp
+++ b/pybuda/csrc/passes/dataformat.cpp
@@ -587,9 +587,6 @@ void fix_data_formats(graphlib::Graph *graph, bool fp32_acc_supported)
         else if (node->node_type() == graphlib::NodeType::kOutput)
         {
             auto output_op = graph->data_operands(node)[0];
-            auto is_partial_datacopy_edge = [](Edge e) {
-                return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-            };
             if (node->as<graphlib::OutputNode>()->untilize())
             {
                 if ((output_op->output_df() == DataFormat::Bfp8_b) || (output_op->output_df() == DataFormat::Bfp4_b) ||
@@ -605,7 +602,7 @@ void fix_data_formats(graphlib::Graph *graph, bool fp32_acc_supported)
                 }
             }
 
-            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
             if (not partial_datacopy_edges.empty())
             {
                 // Current queue is aliased to an existing queue. Impose constraint on write-back producer
@@ -869,10 +866,7 @@ void validate_data_formats(const graphlib::Graph *graph, const DeviceConfig& dev
         else if (node->node_type() == graphlib::NodeType::kOutput)
         {
             auto producer = graph->data_operands(node).at(0);
-            auto is_partial_datacopy_edge = [](Edge e) {
-                return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-            };
-            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+            std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
             if (not partial_datacopy_edges.empty())
             {
                 // Current queue is aliased to an existing queue. Impose constraint on write-back producer
diff --git a/pybuda/csrc/passes/insert_inverse_on_io.cpp b/pybuda/csrc/passes/insert_inverse_on_io.cpp
index c5ff7d81..ff4fd84e 100644
--- a/pybuda/csrc/passes/insert_inverse_on_io.cpp
+++ b/pybuda/csrc/passes/insert_inverse_on_io.cpp
@@ -257,13 +257,9 @@ std::pair<bool, std::unique_ptr<IOEdgeInfo>> find_commutable_output_edge(
         if ((!can_commute and op != initial_op) or (not all_forks_commute_to_output_or_inverse))
             break;
 
-
-        auto is_partial_datacopy_edge = [](graphlib::Edge e) {
-            return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-        };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(users[0], is_partial_datacopy_edge);
-
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(users[0]);
         graphlib::OutputNode *output = dynamic_cast<graphlib::OutputNode *>(users[0]);
+
         // Usually there is no point in inserting an invers on top of an output if the initial op in question is 
         // Adjacent to the output. Unless, this node forks.
         if (output and (op != initial_op or graph->user_data_edges(op).size() > 1) and partial_datacopy_edges.empty())
diff --git a/pybuda/csrc/passes/lower_concat_to_runtime_transform.cpp b/pybuda/csrc/passes/lower_concat_to_runtime_transform.cpp
index 11c85c46..c809f0df 100644
--- a/pybuda/csrc/passes/lower_concat_to_runtime_transform.cpp
+++ b/pybuda/csrc/passes/lower_concat_to_runtime_transform.cpp
@@ -23,10 +23,7 @@ void lower_concat_to_runtime_transform(graphlib::Graph *graph)
     for (graphlib::Node *output_node: graph->nodes_by_type(graphlib::NodeType::kOutput))
     {
         // Skip partial data copy edges (past-cache link between producers/consumers)
-        auto is_partial_datacopy_edge = [](graphlib::Edge e) {
-            return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-        };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(output_node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(output_node);
         if (not partial_datacopy_edges.empty())
             continue;
         
diff --git a/pybuda/csrc/passes/pad_output_buffer.cpp b/pybuda/csrc/passes/pad_output_buffer.cpp
index 9d2f4912..49d5a204 100644
--- a/pybuda/csrc/passes/pad_output_buffer.cpp
+++ b/pybuda/csrc/passes/pad_output_buffer.cpp
@@ -41,10 +41,7 @@ void pad_output_buffer(graphlib::Graph *graph, const DeviceConfig &device_config
     for (graphlib::Node *output_node: graph->nodes_by_type(graphlib::NodeType::kOutput))
     {
         // Skip partial data copy edges (past-cache link between producers/consumers)
-        auto is_partial_datacopy_edge = [](graphlib::Edge e) {
-            return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-        };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(output_node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(output_node);
         if (not partial_datacopy_edges.empty())
             continue;
 
diff --git a/pybuda/csrc/passes/pre_placer_buda_passes.cpp b/pybuda/csrc/passes/pre_placer_buda_passes.cpp
index f851ba21..326e4642 100644
--- a/pybuda/csrc/passes/pre_placer_buda_passes.cpp
+++ b/pybuda/csrc/passes/pre_placer_buda_passes.cpp
@@ -383,13 +383,9 @@ void insert_queues_for_op_intermediates(graphlib::Graph *graph, const std::vecto
 
 void sanitize_past_cache_ios(graphlib::Graph *graph)
 {
-    auto is_partial_datacopy_edge = [](Edge e) {
-        return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-    };
-
     for (Node *node : graph->nodes_by_type(NodeType::kOutput))
     {
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
         if (partial_datacopy_edges.size() == 0)
             continue;
         
@@ -1080,10 +1076,7 @@ void calculate_ublock_order(graphlib::Graph *graph) {
     // Set order on partial datacopy edges consumers
     for (Node * node : graph->nodes())
     {
-        auto is_partial_datacopy_edge = [](Edge e) {
-            return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-        };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_edges(node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_partial_datacopy_edges(node);
 
         if (partial_datacopy_edges.empty())
             continue;
@@ -1349,12 +1342,8 @@ void validate_buffering_queues(graphlib::Graph *graph) {
 
 
 void insert_partial_datacopy_tms(graphlib::Graph *graph) {
-
-    auto is_partial_datacopy_edge = [](Edge e) {
-        return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-    };
     for (graphlib::Node *node : graphlib::topological_sort(*graph))  {
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->user_partial_datacopy_edges(node);
         if (node->node_type() != graphlib::NodeType::kOutput or partial_datacopy_edges.empty()) {
             continue;
         }
diff --git a/pybuda/csrc/passes/tests/test_past_cache_ublock_order.cpp b/pybuda/csrc/passes/tests/test_past_cache_ublock_order.cpp
index 60bab386..a5b4a5df 100644
--- a/pybuda/csrc/passes/tests/test_past_cache_ublock_order.cpp
+++ b/pybuda/csrc/passes/tests/test_past_cache_ublock_order.cpp
@@ -55,10 +55,7 @@ bool check_ublock_order(graphlib::Graph *graph) {
     bool ublock_order_matches = true;
     for (Node * node : graph->nodes())
     {
-        auto is_partial_datacopy_edge = [](Edge e) {
-            return (e.edge_type == graphlib::EdgeType::kPartialDataCopy);
-        };
-        std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_edges(node, is_partial_datacopy_edge);
+        std::vector<graphlib::Edge> partial_datacopy_edges = graph->operand_partial_datacopy_edges(node);
 
         if (partial_datacopy_edges.empty())
             continue;
diff --git a/pybuda/test/benchmark/benchmark/models/t5.py b/pybuda/test/benchmark/benchmark/models/t5.py
index db21a570..61ba12c7 100644
--- a/pybuda/test/benchmark/benchmark/models/t5.py
+++ b/pybuda/test/benchmark/benchmark/models/t5.py
@@ -23,8 +23,6 @@ def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, da
     if data_type == "Fp16_b" and pybuda.detect_available_devices()[0] == BackendDevice.Wormhole_B0:
         os.environ["PYBUDA_ENABLE_DRAM_IO_BUFFER_SCALING"] = "1"
         os.environ["PYBUDA_ENABLE_INPUT_BUFFER_SCALING_FOR_NOC_READERS"] = "1"
-        # Disable DRAM BW estimates.
-        os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "0"
 
     # These are about to be enabled by default.
     #

From 3a6e3b66371494fbddc5776ddc31f9496745f33e Mon Sep 17 00:00:00 2001
From: Vladica Obojevic <vobojevic@tenstorrent.com>
Date: Tue, 18 Jun 2024 11:06:08 +0000
Subject: [PATCH 17/29] Add missing tests for Concatenate operator

(cherry picked from commit b790f0886381df35e2452b98b4008f5b904083c1)
---
 .../test/operators/nary/test_concatenate.py   | 563 ++++++++++++++++++
 pybuda/test/operators/utils/netlist_utils.py  |   2 +
 2 files changed, 565 insertions(+)
 create mode 100644 pybuda/test/operators/nary/test_concatenate.py

diff --git a/pybuda/test/operators/nary/test_concatenate.py b/pybuda/test/operators/nary/test_concatenate.py
new file mode 100644
index 00000000..3d21e4ce
--- /dev/null
+++ b/pybuda/test/operators/nary/test_concatenate.py
@@ -0,0 +1,563 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tests for the Concatenate operator
+#
+#
+#
+# GENERAL OP SUPPORT TEST PLAN:
+# 1. Operand type - any supported type
+# 2. Operand source(s):
+# (+)  2.1 From another op
+#       - Operator -> input
+# (+)  2.2 From tm edge
+#       - Combination: operator -> tm -> input
+#       - tm -> input
+# (+)  2.3 From DRAM queue
+#       - input_queue flag = false
+#       - Special case of From host? May it be triggered if the operator is not the first node of the network?
+#       - Can this be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# (+)  2.4 From DRAM, but prologued (constant)
+#       - Constants must be small enough to fit into L1
+#       - Verification via netlists that scenario is triggered
+#       - Inputs are not prologued for microbatch size = 1
+# (+)  2.5 Const Inputs (const eval pass)
+#       - Operator where all inputs are constants. Does it make a difference if the tensor is bigger than L1?
+#       - Verification via netlists that scenario is triggered???
+# (+)  2.6 From host
+#       - Input tensor as input of network -> Operator is first node in network and input_queue flag = true
+#       - Can this scenario be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# 3 Operand shapes type(s):
+# (+)  3.1 Full tensor (i.e. full expected shape)
+#       - Is 3 dims max for all ops? Ex. Conv is 3d max
+# (+)  3.2 Tensor reduce on one or more dims to 1
+#       - Vector
+#       - Only one dim is not equal to 1
+# (+)  3.3 Scalar
+#       - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value
+# 4. Operand / output size of dimensions (few examples of each, 10 values total)
+# (+)  4.1 Divisible by 32
+# (+)  4.2 Prime numbers
+# (+)  4.3 Very large (thousands, 10s of thousands)
+#       - 100x100, 100x1000
+#       - maybe nightly only
+# (+)  4.4 Extreme ratios between height/width
+#      4.5 ...probably many more interesting combinations here
+# 5. Data format - all supported formats
+# (+)  5.1 Output DF
+# (+)  5.2 Intermediate DF
+# (+)  5.3 Accumulation DF
+# (+)  5.4 Operand DFs
+# (+) 6. Math fidelity - LoFi, HiFi2a, HiFi2b, HiFi3, HiFi4
+# (/) 7. Special attributes - if applicable.. like approx_mode for Exp, for example
+
+
+import pytest
+
+import pybuda
+import torch
+
+from pybuda import PyBudaModule, VerifyConfig
+from pybuda.config import _get_global_compiler_config
+from pybuda.verify import TestKind, verify_module
+from test.operators.utils import netlist_utils
+
+
+# Concatenate operator doesn't work when axis is equal to 0.
+# For input shapes different from (1, n) or (n, m) the following error is raised:
+# Error message:
+#     "...
+#      pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3
+#      ..."
+# In case of shape = (n, m) the following error is raised:
+# Error message:
+#     "...
+#      AssertionError: Error during inference
+#      ..."
+# In case of shape = (1, n) the test passes!
+axises = [0]
+input_shapes = [
+                 (1, 3),            # shape0 - test passes.
+                 (5, 3),            # shape1 - test fails. Message: "AssertionError: Error during inference"
+                 (1, 3, 3),         # shape2 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3"
+                 (2, 3, 3),         # shape3 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3"
+                 (1, 3, 3, 3),      # shape4 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3"
+                 (1, 3, 3, 3, 3)    # shape5 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3"
+               ]
+@pytest.mark.xfail(reason="Concatenate operator doesn't work for axis value of 0.")
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", input_shapes)
+def test_concatenate_invalid_axis(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate two same-shaped inputs along the parametrized axis (0) and run inference verification."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            output = pybuda.op.Concatenate("Concatenate0", x, y, axis=axis)
+            return output
+
+    mod = Model("test_concatenate_invalid_axis_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # Same shape for both inputs (x and y).
+
+    if(math_fidelity is not None):  # Override the global compiler default only when a fidelity is given.
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+# setup of axises and shapes for all tests:
+axises = [-3, -2, -1, 1, 2]
+
+def get_input_shapes(microbatch_size=1):
+    """Return the interesting input shape combinations to test, each with microbatch_size as its first dim."""
+    return [
+            (microbatch_size, 3, 3),         # 3.1 Full tensor (i.e. full expected shape)
+            (microbatch_size, 10, 5),        # 3.1 Full tensor (i.e. full expected shape)
+            (microbatch_size, 1, 15),        # 3.2 Tensor reduce on one or more dims to 1
+            (microbatch_size, 50, 1),        # 3.2 Tensor reduce on one or more dims to 1
+            (microbatch_size, 100, 100),     # 4.3 Very large (thousands, 10s of thousands)
+            (microbatch_size, 100, 1000),    # 4.3 Very large (thousands, 10s of thousands)
+            (microbatch_size, 1, 4991),      # 4.4 Extreme ratios between height/width        - FAILING FOR 4992 and axis=[-1, 2]
+            (microbatch_size, 8191, 1),      # 4.4 Extreme ratios between height/width        - FAILING FOR 8192 and axis=[-1, 2]
+            (microbatch_size, 32, 32),       # 4.1 Divisible by 32
+            (microbatch_size, 96, 96),       # 4.1 Divisible by 32
+            (microbatch_size, 13, 97),       # 4.2 Prime numbers
+            ]
+
+
+#   2.1 From another op
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_another_operand(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate operands are produced by other ops: an Add and a Subtract of the two network inputs."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            # we use Add and Subtract operators to create two operands which are inputs for the Concatenate operator
+            xx = pybuda.op.Add("Add0", x, y)
+            yy = pybuda.op.Subtract("Subtract0", x, y)
+            output = pybuda.op.Concatenate("Concatenate0", xx, yy, axis=axis)
+            return output
+        
+    mod = Model("test_concatenate_inputs_from_another_operand_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # Same shape for both inputs (x and y).
+
+    if(math_fidelity is not None):  # Override the global compiler default only when a fidelity is given.
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+
+#   2.2 From tm edge
+#    - Combination: operator -> tm -> input
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_tm_edge1(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Add -> Transpose -> Concatenate: the transposed tensor is passed as both Concatenate operands."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            v1 = pybuda.op.Add("Add0", x, y)
+            v2 = pybuda.op.tm.Transpose("Transpose0", v1, -1, -2)
+            v3 = pybuda.op.Concatenate("Concatenate0", v2, v2, axis=axis)
+            return v3
+        
+    mod = Model("test_concatenate_inputs_from_tm_edge1_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # Same shape for both inputs (x and y).
+
+    if(math_fidelity is not None):  # Override the global compiler default only when a fidelity is given.
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+
+#   2.2 From tm edge
+#    - tm -> input
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_tm_edge2(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate, along `axis`, two model inputs that are each transposed on their last two dims first, and verify in inference mode."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            v1 = pybuda.op.tm.Transpose("Transpose0", x, -1, -2)
+            v2 = pybuda.op.tm.Transpose("Transpose1", y, -1, -2)
+            v3 = pybuda.op.Concatenate("Concatenate0", v1, v2, axis=axis)
+            return v3
+        
+    mod = Model("test_concatenate_inputs_from_tm_edge2_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # both model inputs use the same shape
+
+    if(math_fidelity is not None):  # only override the global default when a fidelity was requested
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+
+#   2.3 From DRAM queue
+#    - input_queues_on_host flag = False
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_dram_queue(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate two inputs along `axis` with input_queues_on_host disabled, then assert the netlist locates queues x and y in 'dram'."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            return pybuda.op.Concatenate("Concatenate0", x, y, axis=axis)
+        
+    mod = Model("test_concatenate_inputs_from_dram_queue_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # both model inputs use the same shape
+
+    compiler_cfg = _get_global_compiler_config()
+    compiler_cfg.input_queues_on_host = False
+    if(math_fidelity is not None):
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+    file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename  # netlist written by the compile above
+    assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram'
+    assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram'
+
+
+
+#   2.4 From DRAM, but prologued (constant)
+#    - Constants must be small enough to fit into L1
+#    - Inputs are not prologued for microbatch size = 1
+@pytest.mark.parametrize("axis", [pytest.param(-3, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3], but pass fo")),
+                                  pytest.param(-2),
+                                  pytest.param(-1),
+                                  pytest.param(1),
+                                  pytest.param(2)
+                                  ])
+@pytest.mark.parametrize("input_shape, default_dram_params, should_prolog", [
+    pytest.param((2, 3, 3),      True, False),                                                                  # 3.1 Full tensor (i.e. full expected shape)    - FAILING FOR axis=[-3]
+    pytest.param((2, 3, 3),      False, True),                                                                  # 3.1 Full tensor (i.e. full expected shape)    - FAILING FOR axis=[-3]
+    pytest.param((2, 3, 3),      None, True),                                                                   # 3.1 Full tensor (i.e. full expected shape)    - FAILING FOR axis=[-3]
+    pytest.param((1, 3, 3),      True, False),                                                                  # 3.1 Full tensor (i.e. full expected shape)    - PASS
+    pytest.param((1, 3, 3),      False, True),                                                                  # 3.1 Full tensor (i.e. full expected shape)    - PASS
+    pytest.param((1, 3, 3),      None, True),                                                                   # 3.1 Full tensor (i.e. full expected shape)    - PASS - but not according to documentation!
+    pytest.param((2, 10, 5),     None, True),                                                                   # 3.1 Full tensor (i.e. full expected shape)    - FAILING FOR axis=[-3]
+    pytest.param((2, 1, 15),     None, True),                                                                   # 3.2 Tensor reduce on one or more dims to 1    - FAILING FOR axis=[-3]
+    pytest.param((2, 50, 1),     None, True),                                                                   # 3.2 Tensor reduce on one or more dims to 1    - FAILING FOR axis=[-3]
+    pytest.param((2, 100, 100),  None, True),                                                                   # 4.3 Very large (thousands, 10s of thousands)  - FAILING FOR axis=[-3]
+    pytest.param((2, 100, 1000), None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")),  # 4.3 Very large (thousands, 10s of thousands)
+    pytest.param((2, 1, 4991),   None, False, marks=pytest.mark.xfail(reason="FAILING FOR for all axises")),    # 4.4 Extreme ratios between height/width
+    pytest.param((2, 1, 10000),  None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")),  # 4.4 Extreme ratios between height/width
+    pytest.param((2, 8191, 1),   None, False, marks=pytest.mark.xfail(reason="FAILING FOR for all axises")),    # 4.4 Extreme ratios between height/width
+    pytest.param((2, 10000, 1),  None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")),  # 4.4 Extreme ratios between height/width
+    pytest.param((2, 32, 32),    None, True),                                                                   # 4.1 Divisible by 32                           - FAILING FOR axis=[-3]
+    pytest.param((2, 96, 96),    None, True),                                                                   # 4.1 Divisible by 32                           - FAILING FOR axis=[-3]
+    pytest.param((2, 13, 97),    None, True),                                                                   # 4.2 Prime numbers                             - FAILING FOR axis=[-3]
+])
+def test_concatenate_inputs_from_dram_prologued(test_device, axis, input_shape, default_dram_params, should_prolog, input_params=[], math_fidelity=None):
+    """Concatenate a constant with one input along `axis` and assert the netlist 'prologue' setting of input_0_Concatenate0 matches should_prolog."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+            def my_rand(*shape, requires_grad=False):
+                return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach()
+
+            t = input_shape[1:]
+            self.shape_input = (1, *t)  # the constant keeps the input's trailing dims but has a leading dim of 1
+
+            self.add_constant("c")
+            self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True))
+
+
+        def forward(self, x):
+            return pybuda.op.Concatenate("Concatenate0", self.get_constant("c"), x, axis=axis)
+        
+    mod = Model("test_concatenate_inputs_from_dram_prologued_model")
+
+    compiler_cfg = _get_global_compiler_config()
+    compiler_cfg.default_dram_parameters = default_dram_params
+    compiler_cfg.input_queues_on_host = False
+    if(math_fidelity is not None):
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=[input_shape],
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+    file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename  # netlist written by the compile above
+    d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Concatenate0")
+    if should_prolog:
+        assert d['prologue']
+    else:
+        assert not d['prologue']
+
+
+#   2.5 Const Inputs (const eval pass)
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_constants(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate two constants along `axis`, add the result to the sum of two inputs, and assert no netlist graph key contains "Concatenate"."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+            def my_rand(*shape, requires_grad=False):
+                return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach()
+
+            self.shape_input = input_shape
+
+            self.add_constant("c1")
+            self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True))
+
+            self.add_constant("c2")
+            self.set_constant("c2", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True))
+       
+            self.inputs = [
+                pybuda.Tensor.create_from_torch(my_rand(*self.shape_input))
+            ]
+
+        def forward(self, x, y):
+            v1 = pybuda.op.Concatenate("Concatenate0", self.get_constant("c1"), self.get_constant("c2"), axis=axis)
+            # v2 and v3 consume inputs
+            v2 = pybuda.op.Add("Add0", x, y)
+            v3 = pybuda.op.Add("Add1", v1, v2)
+            return v3
+
+    mod = Model("test_concatenate_inputs_from_constants_model")
+
+    if axis % 3 == 0:
+        # TODO: check - for axis = 0 concatenate doesn't change shape, maybe this is incorrect
+        i_shape = input_shape
+    else:
+        i_shape = list(input_shape)
+        i_shape[axis] = 2 * i_shape[axis]  # the inputs must match the concatenated constant, which is doubled along `axis`
+        i_shape = tuple(i_shape)
+    input_shapes = tuple([i_shape for _ in range(2)])
+    
+    if(math_fidelity is not None):  # only override the global default when a fidelity was requested
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+    # Here we check there is no key with "Concatenate" in the netlist in graphs section
+    file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename
+    d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0")
+    for key in d.keys():
+        assert "Concatenate" not in key
+
+
+#   2.6 From host - case of two tensors as input
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+def test_concatenate_inputs_from_host_2(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+    """Concatenate two equally shaped model inputs directly along `axis` and verify in inference mode."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, x, y):
+            return pybuda.op.Concatenate("Concatenate0", x, y, axis=axis)
+        
+    mod = Model("test_concatenate_inputs_from_host_2_model")
+    input_shapes = tuple([input_shape for _ in range(2)])  # both model inputs use the same shape
+
+    if(math_fidelity is not None):  # only override the global default when a fidelity was requested
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+number_of_operands = [
+                       pytest.param(3),   # all passes.
+                       pytest.param(4, marks=pytest.mark.xfail(reason="fails only for GOLDEN_WORMHOLE_B0=1")),   # fails only for GOLDEN_WORMHOLE_B0=1
+                       pytest.param(7, marks=pytest.mark.xfail(reason="fails in any case")),   
+                            # Error message:
+                            # ...
+                            # [Golden-7-input_shape6--1] - RuntimeError: 1 Nodes have no valid grids, exiting
+                            # [Golden-7-input_shape6-2] - RuntimeError: 1 Nodes have no valid grids, exiting
+                            # [Golden-7-input_shape7--2] - RuntimeError: 1 Nodes have no valid grids, exiting
+                            # [Golden-7-input_shape7-1] - RuntimeError: 1 Nodes have no valid grids, exiting
+                            # ...
+                       pytest.param(15, marks=pytest.mark.xfail(reason="fails in any case")),   
+                            # Error message:
+                            # ...
+                            # [Golden-15-input_shape6--1] - RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/balancer_utils.cpp:238: shape.ct % factor == 0
+                            # [Golden-15-input_shape6-2] - RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/balancer_utils.cpp:238: shape.ct % factor == 0
+                            # [Golden-15-input_shape7--2] - RuntimeError: 2 Nodes have no valid grids, exiting
+                            # [Golden-15-input_shape7-1] - RuntimeError: 2 Nodes have no valid grids, exiting
+                            # ...
+                     ]
+
+#   2.6 From host - case of multiple number of tensors as input
+@pytest.mark.parametrize("axis", axises)
+@pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
+@pytest.mark.parametrize("number_of_operands", number_of_operands)
+def test_concatenate_inputs_from_host_multiple_operands(test_device, axis, input_shape, number_of_operands, input_params=[], math_fidelity=None):
+    """Concatenate `number_of_operands` equally shaped model inputs along `axis` in a single op and verify in inference mode."""
+    class Model(PyBudaModule):
+        def __init__(self, name):
+            super().__init__(name)
+
+        def forward(self, *x):
+            return pybuda.op.Concatenate("Concatenate0", *x, axis=axis)
+        
+    mod = Model("test_concatenate_inputs_from_host_multiple_operands")
+    input_shapes = tuple([input_shape for _ in range(number_of_operands)])  # one identical shape per operand
+
+    if(math_fidelity is not None):  # only override the global default when a fidelity was requested
+        compiler_cfg = _get_global_compiler_config()
+        compiler_cfg.default_math_fidelity = math_fidelity
+
+    verify_module(
+        mod,
+        input_shapes=input_shapes,
+        verify_cfg=VerifyConfig(
+            test_kind=TestKind.INFERENCE,
+            devtype=test_device.devtype,
+            arch=test_device.arch,
+        ),
+        input_params=[input_params],
+    )
+
+
+# Operand Data Format and Math Fidelity
+
+# First, we will test only by fixing one axis and one input shape.
+axis = 1
+def get_single_shape(microbatch_size=1):
+    return (microbatch_size, 3, 3)        # Full tensor, small size
+
+# We will not test all combinations of Data Format and Math Fidelity
+# because that would be too many tests.
+#   1. First we will choose Data Format to be Float16_b and test all Math Fidelity values
+#   2. Then we will set Math Fidelity to HiFi4 and test all Data Formats. 
+
+### 1. ####################################################################################
+
+#   5.4 Operand DFs
+verify_input_params=[ 
+                        {"dev_data_format": pybuda.DataFormat.Float16_b},
+                    ]
+
+#  6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4
+compiler_math_fidelity = [
+                            pybuda.MathFidelity.LoFi,
+                            pybuda.MathFidelity.HiFi2,
+                            pybuda.MathFidelity.HiFi3,
+                            pybuda.MathFidelity.HiFi4,
+                         ]
+
+# NOTE: verify_input_params is rebound to the full data-format list further down, so the Float16_b entry is captured here at definition time.
+@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity)
+def test_concatenate_mf_inputs_from_another_operand(test_device, math_fidelity, input_params=verify_input_params[0]):
+    test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), input_params, math_fidelity)
+
+
+### 2. ####################################################################################
+
+#   5.4 Operand DFs
+verify_input_params=[
+                        {"dev_data_format": pybuda.DataFormat.Bfp2},
+                        {"dev_data_format": pybuda.DataFormat.Bfp2_b},
+                        {"dev_data_format": pybuda.DataFormat.Bfp4},
+                        {"dev_data_format": pybuda.DataFormat.Bfp4_b},
+                        {"dev_data_format": pybuda.DataFormat.Bfp8},
+                        {"dev_data_format": pybuda.DataFormat.Bfp8_b},
+                        {"dev_data_format": pybuda.DataFormat.Float16},  
+                        {"dev_data_format": pybuda.DataFormat.Float16_b},
+                        {"dev_data_format": pybuda.DataFormat.Float32},
+                        {"dev_data_format": pybuda.DataFormat.Int8},
+                        {"dev_data_format": pybuda.DataFormat.Lf8},
+                        {"dev_data_format": pybuda.DataFormat.RawUInt16},
+                        {"dev_data_format": pybuda.DataFormat.RawUInt32},
+                        {"dev_data_format": pybuda.DataFormat.RawUInt8},
+                        {"dev_data_format": pybuda.DataFormat.UInt16},
+                    ]
+
+#  6. Math fidelity
+compiler_math_fidelity = pybuda.MathFidelity.HiFi4
+
+# Runs the Add/Subtract -> Concatenate test once per device data format listed above, with the HiFi4 math fidelity set above.
+@pytest.mark.parametrize("input_params", verify_input_params)
+def test_concatenate_df_inputs_from_another_operand(test_device, input_params):
+    test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity)
diff --git a/pybuda/test/operators/utils/netlist_utils.py b/pybuda/test/operators/utils/netlist_utils.py
index 2fb0658e..92024fd4 100644
--- a/pybuda/test/operators/utils/netlist_utils.py
+++ b/pybuda/test/operators/utils/netlist_utils.py
@@ -1,5 +1,7 @@
 import yaml
 
+# TODO: support multiple architectures via wildcards. It will be important for nightly job for operators.
+# Currently, it works only when specifying GOLDEN_WORMHOLE_B0=1 on test run.
 def read_netlist_value(file_path, key_path):
     """
     Reads a netlist value from a YAML file based on the given key path.

From b867b41cecefc3a3baa1b5ec7c69965098b4a31e Mon Sep 17 00:00:00 2001
From: Vladimir Brkic <vbrkic@tenstorrent.com>
Date: Tue, 18 Jun 2024 14:47:39 +0000
Subject: [PATCH 18/29] Add more operators to PyBuda repository

(cherry picked from commit 3f9e39aa062043b5af15122fc335d13f6322eb4e)
---
 pybuda/pybuda/op_repo/pybuda_operators.py     |  48 ++++++-
 pybuda/test/random/rgg/__init__.py            |   2 +
 pybuda/test/random/rgg/algorithms.py          |  11 +-
 pybuda/test/random/rgg/config.py              |   2 +-
 pybuda/test/random/rgg/datatypes.py           |   2 +-
 .../random/rgg/pybuda/generated_model.jinja2  |   6 +-
 .../random/rgg/pytorch/generated_model.jinja2 |   5 +-
 pybuda/test/random/rgg/utils.py               |  23 +++-
 pybuda/test/random/test_graphs.py             | 129 +++++++++++++++++-
 9 files changed, 202 insertions(+), 26 deletions(-)

diff --git a/pybuda/pybuda/op_repo/pybuda_operators.py b/pybuda/pybuda/op_repo/pybuda_operators.py
index 2da088f0..d0e1d3d6 100644
--- a/pybuda/pybuda/op_repo/pybuda_operators.py
+++ b/pybuda/pybuda/op_repo/pybuda_operators.py
@@ -12,18 +12,56 @@
 
 # TODO describe operand and shapes
 _OPERATORS = [
+
+    # Unary operators
+    OperatorDefinition("exp", "pybuda.op.Exp", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("reciprocal", "pybuda.op.Reciprocal", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("buffer", "pybuda.op.Buffer", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("sqrt", "pybuda.op.Sqrt", 1, calc_input_shapes=same_input_shapes),
     OperatorDefinition("relu", "pybuda.op.Relu", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("leaky_relu", "pybuda.op.LeakyRelu", 1, forward_params=[
+        OperatorParamNumber("alpha", float, 0, 100),
+    ], calc_input_shapes=same_input_shapes),
+    OperatorDefinition("nop", "pybuda.op.Identity", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("gelu", "pybuda.op.Gelu", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("log", "pybuda.op.Log", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("sigmoid", "pybuda.op.Sigmoid", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("clip", "pybuda.op.Clip", 1, forward_params=[
+        OperatorParamNumber("min", float, 0, 100),
+        OperatorParamNumber("max", float, 0, 100),
+    ], calc_input_shapes=same_input_shapes),
+    OperatorDefinition("sine", "pybuda.op.Sine", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("cosine", "pybuda.op.Cosine", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("abs", "pybuda.op.Abs", 1, calc_input_shapes=same_input_shapes),
     OperatorDefinition("tanh", "pybuda.op.Tanh", 1, calc_input_shapes=same_input_shapes),
-    OperatorDefinition("exp", "pybuda.op.Exp", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("cumsum", "pybuda.op.CumSum", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("argmax", "pybuda.op.Argmax", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("logical_not", "pybuda.op.LogicalNot", 1, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("dropout", "pybuda.op.Dropout", 1, calc_input_shapes=same_input_shapes),
     OperatorDefinition("pow", "pybuda.op.Pow", 1, forward_params=[
-        # float exponent is currently not supported due to issue #2592
-        # OperatorParamNumber("exponent", float, 0, 100),
-        OperatorParamNumber("exponent", int, 0, 100),
+        OperatorParamNumber("exponent", float, 0, 100),
     ], calc_input_shapes=same_input_shapes),
+    OperatorDefinition("tilizer", "pybuda.op.Tilize", 1, calc_input_shapes=same_input_shapes),
+
+    # Binary operators
     OperatorDefinition("add", "pybuda.op.Add", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("divide", "pybuda.op.Divide", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("subtract", "pybuda.op.Subtract", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("multiply", "pybuda.op.Multiply", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("maximum", "pybuda.op.Max", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("minimum", "pybuda.op.Min", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("heaviside", "pybuda.op.Heaviside", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("binary_stack", "pybuda.op.BinaryStack", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("power", "pybuda.op.Power", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("greater", "pybuda.op.Greater", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("greater_equal", "pybuda.op.GreaterEqual", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("less", "pybuda.op.Less", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("less_equal", "pybuda.op.LessEqual", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("equal", "pybuda.op.Equal", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("not_equal", "pybuda.op.NotEqual", 2, calc_input_shapes=same_input_shapes),
+    OperatorDefinition("logical_and", "pybuda.op.LogicalAnd", 2, calc_input_shapes=same_input_shapes),
 
     OperatorDefinition("matmul", "pybuda.op.Matmul", 2, calc_input_shapes=matmul_inputs),
-    OperatorDefinition("eltwise", "pybuda.op.Add", 2, calc_input_shapes=same_input_shapes),
 ]
 
 
diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py
index f6c60122..7a555765 100644
--- a/pybuda/test/random/rgg/__init__.py
+++ b/pybuda/test/random/rgg/__init__.py
@@ -8,6 +8,7 @@
 from .datatypes import RandomizerTestContext
 from .config import get_randomizer_config_default
 from .utils import StrUtils, GraphUtils
+from .utils import DebugUtils
 from .base import Framework, GraphBuilder, ModelBuilder
 from .base import RandomizerRunner, RandomizerCodeGenerator, process_test
 from .frameworks import Frameworks
@@ -26,6 +27,7 @@
     "get_randomizer_config_default",
     "StrUtils",
     "GraphUtils",
+    "DebugUtils",
     "Framework",
     "GraphBuilder",
     "ModelBuilder",
diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py
index 20f698a6..6dee31bd 100644
--- a/pybuda/test/random/rgg/algorithms.py
+++ b/pybuda/test/random/rgg/algorithms.py
@@ -121,19 +121,10 @@ def prepare_graph(cls, graph: RandomizerGraph, rng_params: random.Random):
 class RandomGraphAlgorithm(GraphBuilder):
     '''Implementation of the random graph building algorithm'''
 
-    SKIP_OPERATORS = (
-        "sqrt",  # skip because it's failing for negative values
-        # "linear",
-        "conv2d",  # skip until calc_input_shapes is properly implemented
-    )
-
     def __init__(self, framework: Framework, randomizer_config):
         super(RandomGraphAlgorithm, self).__init__(randomizer_config)
         self.framework = framework
-        self.operators = [
-            op for op in framework.operator_repository.operators
-            if op.name not in self.SKIP_OPERATORS
-        ]
+        self.operators = framework.operator_repository.operators
 
     def _get_random_operator(self, rng):
         return rng.choice(self.operators)
diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py
index f1d1d8b7..3941e2fa 100644
--- a/pybuda/test/random/rgg/config.py
+++ b/pybuda/test/random/rgg/config.py
@@ -15,10 +15,10 @@ def get_randomizer_config_default():
     randomizer_config = RandomizerConfig (
         print_graph = False,
         print_code = True,
-        # debug_forward = True,
         run_test = True,
         save_tests = True,
         # build_model_from_code = False,
+        debug_shapes = False,
         verify_shapes = False,
         # TODO ranges
         # dim_min=int(os.environ.get("MIN_DIM", 3)),
diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py
index b66924d9..1b120534 100644
--- a/pybuda/test/random/rgg/datatypes.py
+++ b/pybuda/test/random/rgg/datatypes.py
@@ -78,11 +78,11 @@ class RandomizerGraph:
 class RandomizerConfig:
     print_graph: bool = True
     print_code: bool = False
-    # debug_forward: bool = True  # TODO remove obsoleted
     run_test: bool = True
     test_dir:str = "pybuda/test/random_tests"
     save_tests: bool = False
     # build_model_from_code: bool = False  # TODO remove obsoleted
+    debug_shapes: bool = False  # no trailing comma: it would make the default the truthy tuple (False,)
     verify_shapes: bool = False,
     dim_min: int = 3
     dim_max: int = 4
diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2
index 1575046c..5b339c4f 100644
--- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2
+++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2
@@ -4,6 +4,7 @@ import pybuda
 import pytest
 from pybuda.verify import verify_module, VerifyConfig
 {% endif %}
+from test.random.rgg import DebugUtils
 from pybuda import PyBudaModule, Tensor
 
 {# TODO replace empty new lines with spaces to keep formatting in pipeline #}
@@ -25,10 +26,11 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule):
         {% for node in graph.nodes %}
 
         # shapes: {{ node.input_shapes }} -> {{ node.output_shape }}
-        inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if node.operator.is_layer() %}
+        inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %}
+        print(f"{{ node.layer_name() }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer() %}
         {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %}
         {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name() }}', {{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
-        assert {{ node.out_value }}.shape == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
+        assert {{ node.out_value }}.shape.dims == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
 
         return v
 {% if test_format %}
diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
index 1f7319d6..7e556411 100644
--- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2
+++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
@@ -4,6 +4,8 @@ import pybuda
 import pytest
 from pybuda.verify import verify_module, VerifyConfig
 {% endif %}
+from test.random.rgg import DebugUtils
+
 {# TODO replace empty new lines with spaces to keep formatting in pipeline #}
 class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
     # graph_builder: {{ graph_builder_name }}
@@ -22,7 +24,8 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
         {% for node in graph.nodes %}
 
         # shapes: {{ node.input_shapes }} -> {{ node.output_shape }}
-        inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if node.operator.is_layer() %}
+        inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %}
+        print(f"{{ node.layer_name() }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer() %}
         {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %}
         {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}({{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
         assert {{ node.out_value }}.shape == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index de978c33..af2aee84 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -5,11 +5,15 @@
 
 
 import random
-from typing import List, Dict
+from typing import Callable, List, Dict
 from dataclasses import asdict
+from loguru import logger
 import re
 import yaml
 
+import torch
+import pybuda
+
 from pybuda.op_repo import OperatorParam, OperatorDefinition, OperatorParamNumber
 
 from .datatypes import TensorShape
@@ -65,7 +69,7 @@ def random_value_for_number_param(cls, param: OperatorParamNumber, rng_params: r
         if param.type == float:
             return rng_params.uniform(param.min_value, param.max_value)
         elif param.type == int:
-            return rng_params.randint(param.min_value, param.max_value + 1)
+            return rng_params.randint(param.min_value, param.max_value)
         else:
             raise ValueError(f"Unsupported type {param.type}")
 
@@ -165,3 +169,18 @@ def get_open_nodes_with_input_shape(cls, nodes: List[RandomizerNode], input_shap
     @classmethod
     def calc_input_shapes(cls, node: RandomizerNode, rng_shape: random.Random) -> List[TensorShape]:
         return node.operator.calc_input_shapes(node.operator, node.output_shape, rng_shape)
+
+
+class DebugUtils:
+    ''' Helpers for printing/logging tensor formats and shapes while debugging generated models '''
+    @classmethod
+    def format_tensors(cls, tensors: List[pybuda.Tensor]):
+        ''' Describe each tensor as "<data format or torch type>:<shape>"; an empty list yields [] '''
+        def format_tensor(t) -> str:
+            # Dispatch per element: pybuda tensors report data_format, anything else is formatted via .type()
+            return f'{t.data_format}:{t.shape}' if isinstance(t, pybuda.Tensor) else f'{t.type()}:{t.shape}'
+        return [format_tensor(t) for t in tensors]
+
+    @classmethod
+    def debug_inputs(cls, inputs: List[pybuda.Tensor]):
+        logger.info(f"inputs: {cls.format_tensors(inputs)}")
diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py
index cd2291a0..13eee3c5 100644
--- a/pybuda/test/random/test_graphs.py
+++ b/pybuda/test/random/test_graphs.py
@@ -3,19 +3,140 @@
 # SPDX-License-Identifier: Apache-2.0
 # Test random graph configurations by utilizing Random Graph Generator Algorithm and targeting PyBuda and PyTorch frameworks
 
+from enum import Enum
 import pytest
 
+from typing import Tuple
+from copy import copy
+
+from pybuda.op_repo import OperatorParamNumber
+from pybuda.op_repo import OperatorDefinition
+
+from test.random.rgg import Framework
 from test.random.rgg import Frameworks
 from test.random.rgg import RandomGraphAlgorithm
+from test.random.rgg import RandomizerConfig
 from test.random.rgg import process_test
 
 
-# @pytest.mark.parametrize("framework", [framework.value for framework in Frameworks])
+class FrameworkTestUtils:
+    ''' Helpers that build test-local copies of frameworks and operators so the originals are not modified '''
+    @staticmethod
+    def copy_framework(framework: Framework, skip_operators: Tuple[str, ...] = ()):
+        ''' Shallow-copy framework and its repository, dropping skip_operators; asserts each skipped name was present '''
+        framework0 = framework
+        framework = copy(framework)
+        framework.operator_repository = copy(framework.operator_repository)
+        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name not in skip_operators]
+        assert len(framework.operator_repository.operators) + len(skip_operators) == len(framework0.operator_repository.operators), "Operators count should match after skipping operators"
+        return framework
+
+    @staticmethod
+    def copy_operator(framework: Framework, operator_name: str) -> OperatorDefinition:
+        ''' Replace the named operator in the framework's operator list with a copy and return it; None if not found '''
+        operators = framework.operator_repository.operators
+        i, operator = next(((i, operator) for i, operator in enumerate(operators) if operator.name == operator_name), (None, None))
+        if operator is None:
+            return None
+        operator = copy(operator)
+        operators[i] = operator
+        return operator
+
+
+class FrameworksHealthy(Enum):
+    ''' Adjust repositories to test healthy operators '''
+
+    @staticmethod
+    def healty_pybuda():
+        SKIP_OPERATORS = (
+            # Unary operators
+            "exp",  # pcc?
+            "sqrt",  # skip because it's failing for negative values
+            "cumsum",  # bug
+            "argmax",  # shape calc is wrong
+            "logical_not",  # bug
+            "dropout",  # pcc?
+            "tilizer",  # bug
+
+            # Binary operators
+            "divide",  # bug
+            "binary_stack",  # bug
+            "power",  # occasionally fails
+            "logical_and",  # bug
+        )
+
+        framework = FrameworkTestUtils.copy_framework(Frameworks.PYBUDA.value, SKIP_OPERATORS)
+
+        pow_operator = FrameworkTestUtils.copy_operator(framework, "pow")
+        if pow_operator:
+            pow_operator.forward_params = [
+                # float exponent is currently not supported due to issue #2592
+                # OperatorParamNumber("exponent", float, 0, 100),
+                # OperatorParamNumber("exponent", int, 0, 100),
+                OperatorParamNumber("exponent", int, 0, 4),  # pcc for higher numbers fails
+            ]
+
+        return framework
+
+    @staticmethod
+    def healty_pytorch():
+        SKIP_OPERATORS = (
+            "sqrt",  # skip because it's failing for negative values
+            # "linear",
+            "conv2d",  # skip until calc_input_shapes is properly implemented
+        )
+
+        framework = FrameworkTestUtils.copy_framework(Frameworks.PYTORCH.value, SKIP_OPERATORS)
+
+        return framework
+    # Call through __func__: staticmethod objects are not callable inside the class body before Python 3.10
+    PYBUDA = healty_pybuda.__func__()
+    PYTORCH = healty_pytorch.__func__()
+
+
 @pytest.mark.parametrize("framework", [
-    Frameworks.PYBUDA.value,
-    Frameworks.PYTORCH.value,
+    FrameworksHealthy.PYBUDA.value,
 ])
-def test_random_graph_algorithm(test_index, random_seeds, test_device, randomizer_config, framework):
+def test_random_graph_algorithm_pybuda(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework):
+    # adjust randomizer_config
+    randomizer_config = copy(randomizer_config)
+    # randomizer_config.debug_shapes = True
+    # randomizer_config.verify_shapes = True
+    randomizer_config.dim_min = 3
+    randomizer_config.dim_max = 4
+    randomizer_config.op_size_per_dim_min = 4
+    # randomizer_config.op_size_per_dim_min = 16
+    randomizer_config.op_size_per_dim_max = 8
+    # randomizer_config.op_size_per_dim_max = 64
+    # randomizer_config.op_size_per_dim_max = 256
+    randomizer_config.microbatch_size_min = 1
+    randomizer_config.microbatch_size_max = 8
+    randomizer_config.num_of_nodes = 10
+
+    # TODO random_seed instead of random_seeds
+    random_seed = random_seeds[test_index]
+    process_test(test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
+
+
+@pytest.mark.parametrize("framework", [
+    FrameworksHealthy.PYTORCH.value,
+])
+def test_random_graph_algorithm_pytorch(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework):
+    # adjust randomizer_config
+    randomizer_config = copy(randomizer_config)
+    # randomizer_config.debug_shapes = True
+    # randomizer_config.verify_shapes = True
+    randomizer_config.dim_min = 4
+    randomizer_config.dim_max = 4
+    randomizer_config.op_size_per_dim_min = 4
+    # randomizer_config.op_size_per_dim_min = 16
+    randomizer_config.op_size_per_dim_max = 8
+    # randomizer_config.op_size_per_dim_max = 64
+    # randomizer_config.op_size_per_dim_max = 256
+    randomizer_config.microbatch_size_min = 1
+    randomizer_config.microbatch_size_max = 8
+    randomizer_config.num_of_nodes = 5
+
     # TODO random_seed instead of random_seeds
     random_seed = random_seeds[test_index]
     process_test(test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)

From ebf480be3f309efcfcd33cbce26ff84a36183e6e Mon Sep 17 00:00:00 2001
From: Ashok Kumar Kannan <akannan@tenstorrent.com>
Date: Wed, 19 Jun 2024 12:08:11 +0000
Subject: [PATCH 19/29] Fix pybuda n300 failures

(cherry picked from commit b65ba79b0b09b86ef88d108b12b10aa9ab8ab0d0)
---
 .../model_demos/high_prio/cnn/onnx/test_perceiverio.py     | 7 ++++++-
 pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py | 4 +++-
 .../test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py  | 2 ++
 .../model_demos/high_prio/nlp/pytorch/test_whisper_1.py    | 1 +
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py
index 54ff5f87..276a3d25 100644
--- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py
+++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py
@@ -42,10 +42,14 @@ def test_perceiver_for_image_classification_onnx(test_device, model_name):
     compiler_cfg.enable_auto_fusing = False
     verify_enabled = True
 
+    pcc_value = 0.96
     if test_device.arch == pybuda.BackendDevice.Wormhole_B0:
 
         if model_name == "deepmind/vision-perceiver-learned":
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{105*1024}"
+            compiler_cfg.balancer_op_override("add_63", "t_stream_shape", (1, 2))
+            if test_device.devtype == pybuda.BackendType.Silicon:
+                pcc_value = 0.95
 
         elif model_name == "deepmind/vision-perceiver-conv":
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}"
@@ -58,6 +62,7 @@ def test_perceiver_for_image_classification_onnx(test_device, model_name):
 
         elif model_name == "deepmind/vision-perceiver-fourier":
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}"
+            compiler_cfg.balancer_op_override("add_58", "t_stream_shape", (1, 2))
 
     elif test_device.arch == pybuda.BackendDevice.Grayskull:
 
@@ -107,6 +112,6 @@ def test_perceiver_for_image_classification_onnx(test_device, model_name):
             devmode=test_device.devmode,
             test_kind=TestKind.INFERENCE,
             enabled=verify_enabled,  # pcc drops in silicon devicetype
-            pcc=0.96,
+            pcc=pcc_value,
         ),
     )
diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py
index 9e80801d..b684c769 100644
--- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py
+++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py
@@ -126,7 +126,7 @@ def test_yolo_v5_480x480_onnx(test_device, variant):
         os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1"
         os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10"
         os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}"
-        if variant == "yolov5m":
+        if variant in ("yolov5m", "yolov5s"):
             compiler_cfg.balancer_op_override(
                 "concatenate_19.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12",
                 "grid_shape",
@@ -227,6 +227,8 @@ def test_yolo_v5_640x640_onnx(test_device, variant):
                 "concatenate_478.dc.concatenate.7", "grid_shape", (1, 1)
             )
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{150*1024}"
+            compiler_cfg.enable_auto_fusing = False
+
 
     elif test_device.arch == BackendDevice.Grayskull:
 
diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py
index 12e23245..9e802e52 100644
--- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py
+++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py
@@ -99,6 +99,7 @@ def test_yolox_pytorch(variant, test_device):
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4))
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4))
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4))
+                compiler_cfg.place_on_new_epoch("concatenate_1897.dc.sparse_matmul.11.lc2")
 
             elif variant == "yolox_darknet":
                 os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "53248"
@@ -109,6 +110,7 @@ def test_yolox_pytorch(variant, test_device):
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4))
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", "t_stream_shape", (5, 1))
                 compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4))
+                compiler_cfg.place_on_new_epoch("concatenate_2264.dc.sparse_matmul.11.lc2")
 
     # prepare model
     weight_name = f"{variant}.pth"
diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py
index cc164c58..171db204 100644
--- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py
+++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py
@@ -115,6 +115,7 @@ def test_whisper_enc_dec(test_device, variant):
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "0"
             os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1"
             compiler_cfg.enable_auto_fusing = False
+            compiler_cfg.place_on_new_epoch("matmul_2818")
 
     elif test_device.arch == BackendDevice.Grayskull:
         compiler_cfg.enable_auto_fusing = False

From 245243d0ca91cbb2c0425001cb5f0d2af3840109 Mon Sep 17 00:00:00 2001
From: Nikola Obradovic <nobradovic@tenstorrent.com>
Date: Wed, 19 Jun 2024 08:15:56 +0000
Subject: [PATCH 20/29] [Balancer] Migrate policy MinimizeGrid to
 PolicyManager.

(cherry picked from commit a3738fa2e5ddd11bd05cd536810747d2899fed13)
---
 README.debug.md                               |  1 +
 pybuda/csrc/balancer/policies/policies.cpp    |  3 +-
 .../policies/policy_minimize_grid.cpp         | 66 +++++++++++--------
 3 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/README.debug.md b/README.debug.md
index 05f5ec7f..80637dc0 100644
--- a/README.debug.md
+++ b/README.debug.md
@@ -109,6 +109,7 @@
  * PYBUDA\_RIBBON2\_CALCULATE\_TARGET\_CYCLES: Calculate target cycles for every epoch within Ribbon2 balancing policy. (default: 0/False)
  * PYBUDA\_RIBBON2\_CALCULATE\_TARGET\_CYCLES\_APPLY\_FILTERING: Apply filtering on GS search space while calculating dynamic cycles per epoch within Ribbon2 balancing policy. (default: 0/False)
  * PYBUDA\_RIBBON\_LEGACY: Use legacy Ribbon balancing policy. (default: 0/False)
+ * PYBUDA\_MAXIMIZE\_GRID: Reverse logic of MinimizeGrid policy. Maximize grid size for all ops. (default: 0/False)
  * PYBUDA\_ENABLE\_HOST\_INPUT\_NOP\_BUFFERING: Enable nop buffering of input host read. (default: 0/False)
  * PYBUDA\_AUTO\_RECOMPILE: Triggers handling of backend compile error and recompiles the model. (default: 1/True)
  * PYBUDA\_AUTO\_RECOMPILE\_TARGET\_CYCLES: Enables adjustment of target cycles during recompile if no errors from backend have been previously handled. Requires PYBUDA\_AUTO\_RECOMPILE to be enabled to work. (default: 0/False)
diff --git a/pybuda/csrc/balancer/policies/policies.cpp b/pybuda/csrc/balancer/policies/policies.cpp
index 2a40ea27..7a612879 100644
--- a/pybuda/csrc/balancer/policies/policies.cpp
+++ b/pybuda/csrc/balancer/policies/policies.cpp
@@ -40,6 +40,7 @@ BalancerPolicySolution run_policy(
         }
         case PolicyType::MinimizeGrid:
         {
+            TT_ASSERT(config.use_interactive_placer);
             balancer_policy_solution = run_policy_minimize_grid(graph, config, graph_solver);
             break;
         }
@@ -126,9 +127,9 @@ bool can_use_interactive_placer(PolicyType policy_type)
     switch (policy_type)
     {
         case PolicyType::MaximizeTMinimizeGrid:
-        case PolicyType::MinimizeGrid:
         case PolicyType::CNN: return false;
 
+        case PolicyType::MinimizeGrid:
         case PolicyType::Random:
         case PolicyType::NLP:
         case PolicyType::Ribbon: 
diff --git a/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp b/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp
index bf00916c..5cf46e46 100644
--- a/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp
+++ b/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp
@@ -2,42 +2,52 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 #include "balancer/policies/policy_minimize_grid.hpp"
-
-#include "balancer/balancer.hpp"
-#include "utils/logger.hpp"
+#include "balancer/policies/policy_manager.hpp"
+#include "balancer/policies/policy_utils.hpp"
 
 using Graph = tt::graphlib::Graph;
 using Node = tt::graphlib::Node;
-using NodeType = tt::graphlib::NodeType;
-using Edge = tt::graphlib::Edge;
-using DataFormat = tt::DataFormat;
 
-namespace tt::balancer {
-BalancerPolicySolution run_policy_minimize_grid(Graph const* graph, BalancerConfig const&, legalizer::GraphSolver& graph_solver)
+namespace tt::balancer
+{
+BalancerPolicySolution run_policy_minimize_grid(
+    Graph const* graph, BalancerConfig const& config, legalizer::GraphSolver& graph_solver)
 {
-    for (Node* node : tt::graphlib::topological_sort(*graph)) {
-        if (node->node_type() != NodeType::kBudaOp)
-            continue;
-
-        auto legal_op_models = graph_solver.at(node);
-        std::vector<OpModel> op_models(legal_op_models.begin(), legal_op_models.end());
-        std::sort(
-            op_models.begin(),
-            op_models.end(),
-            [](OpModel const& a, OpModel const& b) -> bool
+    PolicyManager policy_manager(graph, config, graph_solver);
+    bool epoch_completed = false;
+    bool maximize_grid = env_as<bool>("PYBUDA_MAXIMIZE_GRID", false);
+    if (maximize_grid)
+    {
+        policy_manager.invalidate_suboptimal_op_models(legalizer::MatmulSparseDenseGridPairing);
+    }
+
+    // Pick OpModel for each node.
+    //
+    while (const Node* node = policy_manager.get_next_op())
+    {
+        auto legal_op_models = policy_manager.at(node);
+        const OpModel* target_grid_op_model = &(*legal_op_models.begin());
+
+        for (const OpModel& op_model : legal_op_models)
+        {
+            if ((!maximize_grid and op_model.grid_shape.volume() < target_grid_op_model->grid_shape.volume()) or
+                (maximize_grid and op_model.grid_shape.volume() > target_grid_op_model->grid_shape.volume()))
             {
-                int perimeter_a = a.grid_shape.r + a.grid_shape.c;
-                int perimeter_b = b.grid_shape.r + b.grid_shape.c;
-                if (perimeter_a == perimeter_b)
-                    return a.grid_shape.r < b.grid_shape.r;
-                return perimeter_a < perimeter_b;
-            });
-        graph_solver.set(node, op_models.front());
-        log_debug(LogBalancer, "Selected minimum grid for node: {}", node->name());
-        log_debug(LogBalancer, "  {} {}", op_models.front().grid_shape, op_models.front().t_stream_factor);
+                target_grid_op_model = &op_model;
+            }
+        }
+
+        std::tie(std::ignore, epoch_completed, std::ignore) = policy_manager.commit_op(*target_grid_op_model);
+
+        // If we're done with the epoch, finish it.
+        //
+        if (epoch_completed)
+        {
+            policy_manager.finish_current_epoch();
+        }
     }
 
-    return BalancerPolicySolution(graph_solver.finish());
+    return policy_manager.commit_solution();
 }
 
 }  // namespace tt::balancer

From f2aed91c76d3cff1e94814c3e0445a170ea37071 Mon Sep 17 00:00:00 2001
From: Stefan Djordjevic <sdjordjevic@tenstorrent.com>
Date: Thu, 20 Jun 2024 09:10:47 +0000
Subject: [PATCH 21/29] Adding few more exception rules in python script

(cherry picked from commit 38f08c90f02ee3db9438d0903d5260ea54fa4460)
---
 pybuda/pybuda/compile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pybuda/pybuda/compile.py b/pybuda/pybuda/compile.py
index 993d106c..7f865067 100644
--- a/pybuda/pybuda/compile.py
+++ b/pybuda/pybuda/compile.py
@@ -508,7 +508,7 @@ def init_compile(context: CompileContext) -> CompileDepth:
     ci.initialize_output_build_directory(context.backend_output_directory)
 
     device_cfg = dev.get_device_config(compiler_cfg=compiler_cfg)
-    logger.info("Device architecutre: {}", device_cfg.arch_name)
+    logger.info("Device architecture: {}", device_cfg.arch_name)
     logger.info("Device grid size: r = {}, c = {}", device_cfg.grid_size.r, device_cfg.grid_size.c)
 
     # Set global cluster descriptor file path if not provided by user (it was obtained from backend when getting device config)

From 8d5363e9e0ae7e37f61c540d9a0fe06a1c4e8bdc Mon Sep 17 00:00:00 2001
From: Predrag Ilkic <pilkic@tenstorrent.com>
Date: Mon, 10 Jun 2024 15:01:37 +0200
Subject: [PATCH 22/29] [fork-join] fix merge queue and nop instructions

The case when a queue instruction is merged with a NOP instruction
was not handled properly. This case can be hit when GS cuts the graph
somewhere on a fork-join path, so we need to buffer the fork-join
with queues.

Let's say that the previous instruction added NOP buffer_0_1_2
between nodes A and B. In the next iteration of epoch balancing,
GS cuts the graph somewhere on that fork-join and we need to
add a buffering queue on the path between A and B.

The new instruction will have nodes A and buffer_0_1_2 as src/dest pair.
So, to properly merge these instructions, we need to create a new
queue instruction with nodes A and B as src/dest pair, and use it
as a replacement for the old (NOP) instruction.

(cherry picked from commit bfd97cdffd9c5d10f2327e230baeaaaea577f0be)
---
 pybuda/csrc/passes/fork_join.cpp            | 18 ++++++++++++++----
 pybuda/csrc/passes/tests/test_fork_join.cpp |  7 +++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/pybuda/csrc/passes/fork_join.cpp b/pybuda/csrc/passes/fork_join.cpp
index ecf03ffe..8327d6d9 100644
--- a/pybuda/csrc/passes/fork_join.cpp
+++ b/pybuda/csrc/passes/fork_join.cpp
@@ -2025,8 +2025,13 @@ std::shared_ptr<InsertionInstruction> merge_instructions(
 
     if (instr->instr_type == InstructionType::QueueInstruction)
     {
-        // Return the new Queue instruction (queues have precedence over nops).
-        return instr;
+        // Return the new queue instruction (queues have precedence over nops),
+        // but update its input_id and dest name to match the previous instruction.
+        std::shared_ptr<QueueInsertionInstruction> merged_instr =
+            std::make_shared<QueueInsertionInstruction>(*static_cast<QueueInsertionInstruction *>(instr.get()));
+        merged_instr->dest = prev_instr->dest;
+        merged_instr->input_id = prev_instr->input_id;
+        return merged_instr;
     }
 
     TT_ASSERT(instr->instr_type == InstructionType::NopInstruction);
@@ -2096,7 +2101,10 @@ InsertionInstructionMap merge_with_prev_instr(
             log_trace(
                 LogGraphCompiler,
                 "Found an existing instruction in prev_instructions with the same key as the new one!");
-            combined_instructions[key] = merge_instructions(combined_instructions[key], instr);
+
+            auto merged_instr = merge_instructions(combined_instructions[key], instr);
+            TT_ASSERT(key == merged_instr->unique_id(), "Unique id of merged instruction should be the same as the one of the previous instruction.");
+            combined_instructions[key] = merged_instr;
         }
         else
         {
@@ -2136,7 +2144,9 @@ InsertionInstructionMap merge_with_prev_instr(
                 if (combined_instructions.count(id) != 0)
                 {
                     log_trace(LogGraphCompiler, "Merging with previous instruction: {}", combined_instructions[id]);
-                    combined_instructions[id] = merge_instructions(combined_instructions[id], instr);
+                    auto merged_instr = merge_instructions(combined_instructions[id], instr);
+                    TT_ASSERT(id == merged_instr->unique_id(), "Unique id of merged instruction should be the same as the one of the previous instruction.");
+                    combined_instructions[id] = merged_instr;
                     continue;
                 }
 
diff --git a/pybuda/csrc/passes/tests/test_fork_join.cpp b/pybuda/csrc/passes/tests/test_fork_join.cpp
index 3d8e8142..1be0d920 100644
--- a/pybuda/csrc/passes/tests/test_fork_join.cpp
+++ b/pybuda/csrc/passes/tests/test_fork_join.cpp
@@ -203,6 +203,9 @@ TEST_F(SimpleForkJoin, TestRecoverOriginalInstruction)
     EXPECT_EQ(combined_instructions.size(), 1) << "Expected only one instruction in resulting map. Queue and the NOP should have been merged.";
     EXPECT_EQ(combined_instructions.begin()->second->instr_type, InstructionType::QueueInstruction);
 
+    // Merged instruction should have the same unique_id as the original NOP instruction.
+    EXPECT_EQ(combined_instructions.begin()->second->unique_id(), nop_instruction->unique_id());
+
     // Check the same with NOP instruction.
     auto nop_instruction_2 = create_instruction<NopInsertionInstruction>(src_name, buffer_name, edge.consumer_input_port_id, edge.producer_output_port_id, false, true);
     new_instructions.clear();
@@ -212,6 +215,10 @@ TEST_F(SimpleForkJoin, TestRecoverOriginalInstruction)
     EXPECT_EQ(combined_instructions.size(), 1) << "Expected only one instruction in resulting map. NOPs should have been merged.";
     EXPECT_EQ(combined_instructions.begin()->second->instr_type, InstructionType::NopInstruction);
     auto resulting_instruction = static_cast<NopInsertionInstruction*>(combined_instructions.begin()->second.get());
+
+    // Merged instruction should have the same unique_id as the original NOP instruction.
+    // Additionally, the nop count should be updated.
+    EXPECT_EQ(resulting_instruction->unique_id(), nop_instruction->unique_id());
     EXPECT_EQ(resulting_instruction->nop_count, nop_instruction->nop_count + nop_instruction_2->nop_count);
 }
 

From a6d60938b0703fb55a0ada3b364cee4635192085 Mon Sep 17 00:00:00 2001
From: Predrag Ilkic <pilkic@tenstorrent.com>
Date: Wed, 19 Jun 2024 12:32:48 +0200
Subject: [PATCH 23/29] [test-cleanup] removing legacy ribbon flag

Yolov5 tests were failing due to an issue with merging
of queue instructions in fork-join buffering logic for
Ribbon.

Removing legacy ribbon flag now that the issue is fixed.

(cherry picked from commit dae7cd158f150b634ae491e80a6877abebb3cf9b)
---
 pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
index e11c1966..9033753d 100644
--- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
+++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py
@@ -214,7 +214,6 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size
     if test_device.arch == BackendDevice.Grayskull:
         os.environ["PYBUDA_PAD_SPARSE_MM"] = "{113:128}"
         if size == "x":
-            os.environ["PYBUDA_RIBBON_LEGACY"] = "1"
             os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1"
             os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1"
             os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10"
@@ -226,7 +225,6 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size
             compiler_cfg.balancer_op_override("concatenate_26.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6,1))
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"]  = f"{32*1024}"
         elif size == "n":
-            os.environ["PYBUDA_RIBBON_LEGACY"] = "1"
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"]  = f"{16*1024}"
         else:
             os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"]  = f"{16*1024}"

From b0afaa8e3ffcd7e3b30ab27e013631aa78ee47b2 Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic <vmilosevic@tenstorrent.com>
Date: Thu, 20 Jun 2024 10:37:16 +0000
Subject: [PATCH 24/29] Merge community changes, fix spelling and adding main
 guard

(cherry picked from commit 6e9a3669d96feb6b6b9709b176f6773bf39cc1cd)
---
 docs/public/developer.rst    | 20 ++++----
 docs/public/installation.rst |  2 +-
 docs/public/terminology.rst  |  4 +-
 docs/public/user_guide.rst   | 88 +++++++++++++++++++-----------------
 4 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/docs/public/developer.rst b/docs/public/developer.rst
index 091fb723..c0b2b928 100644
--- a/docs/public/developer.rst
+++ b/docs/public/developer.rst
@@ -125,7 +125,7 @@ User Visible Constants
 ++++++++++++++++++++++
 
 Constant registers are implemented as objects which can be referenced
-whereever a vector can be used.
+wherever a vector can be used.
 
   * Grayskull:
 
@@ -230,8 +230,8 @@ Library
 
 Below ``Vec`` means any vector type.
 
-Grayskulll and Wormhole
-^^^^^^^^^^^^^^^^^^^^^^^
+Grayskull and Wormhole
+^^^^^^^^^^^^^^^^^^^^^^
 
 .. code-block:: c++
 
@@ -396,8 +396,8 @@ For example:
     l_reg[LRegs::LReg1] = x;         // this is necessary at the end of the function
                                      // to preserve the value in LReg1 (if desired)
 
-Miscelaneous
-************
+Miscellaneous
+*************
 
 Register Pressure Management
 ++++++++++++++++++++++++++++
@@ -413,7 +413,7 @@ loads dst_reg[0] and dst_reg[1] into temporary LREGs (as expected).
 
 The compiler will not spill registers.  Exceeding the number of registers
 available will result in the cryptic: ``error: cannot store SFPU register
-(reigster spill?) - exiting!`` without a line number.
+(register spill?) - exiting!`` without a line number.
 
 The compiler does a reasonable job with lifetime analysis when assigning
 variables to registers.  Reloading or recalculating results helps the compiler
@@ -448,7 +448,7 @@ The ``SFPREPLAY`` instruction available on Wormhole allows the RISCV processor
 to submit up to 32 SFP instructions at once.  The compiler looks for sequences
 of instructions that repeat, stores these and then "replays" them later.
 
-The current implemention of this is very much first cut: it does not handle
+The current implementation of this is very much first cut: it does not handle
 kernels with rolled up loops very well.  Best performance is typically attained by
 unrolling the top level loop and then letting the compiler find the repetitions
 and replace them with ``SFPREPLAY``.  This works well when the main loop
@@ -494,15 +494,15 @@ Register Spilling
 +++++++++++++++++
 
 The compiler does not implement register spilling.  Since Grayskull only has 4
-LRegs, running out of registers is a common occurence.  If you see the
-following: ``error: cannot store SFPU register (reigster spill?) - exiting!``
+LRegs, running out of registers is a common occurrence.  If you see the
+following: ``error: cannot store SFPU register (register spill?) - exiting!``
 you have most likely run out of registers.
 
 Error Messages
 ++++++++++++++
 
 Unfortunately, many errors are attributed to the code in the wrapper rather than in the code
-being written.  For example, using an unitialized variable would show an error at a macro
+being written.  For example, using an uninitialized variable would show an error at a macro
 called by a wrapper function before showing the line number in the user's code.
 
 Function Calls
diff --git a/docs/public/installation.rst b/docs/public/installation.rst
index cfa7d589..8797b41c 100644
--- a/docs/public/installation.rst
+++ b/docs/public/installation.rst
@@ -38,7 +38,7 @@ Python Environment Installation
 
 It is strongly recommended to use virtual environments for each project utilizing PyBUDA and Python dependencies. Creating a new virtual environment with PyBUDA and libraries is very easy.
 
-Prerequisites (detailed sections below) for python envirnment installation are listed here:
+Prerequisites (detailed sections below) for python environment installation are listed here:
 
   * `Setup HugePages (below) <#setup-hugepages>`_
   * `PCI Driver Installation (below) <#pci-driver-installation>`_
diff --git a/docs/public/terminology.rst b/docs/public/terminology.rst
index 1ff12b47..c492ac63 100644
--- a/docs/public/terminology.rst
+++ b/docs/public/terminology.rst
@@ -27,7 +27,7 @@ The dense tensor math unit in Tensix. It performs bulk tensor math operations, s
 
 SFPU
 ----
-Tensix SIMD engine, used for various miscellaneous activations operations, such as exponents, square roots, softmax, topK, and others.
+Tensix SIMD engine, used for various miscellaneous activation operations, such as exponents, square roots, softmax, topK, and others.
 
 Unpacker
 --------
@@ -49,7 +49,7 @@ A collection of ops that fits onto one chip. In a typical workflow, epoch code w
 
 Buffer
 ------
-A reserved location in local memory, DRAM, or host memory. Buffers are used either as desinations for operation outputs, sources for operation inputs, or temporary locations for intermediate data.
+A reserved location in local memory, DRAM, or host memory. Buffers are used either as destinations for operation outputs, sources for operation inputs, or temporary locations for intermediate data.
 
 Pipe
 ----
diff --git a/docs/public/user_guide.rst b/docs/public/user_guide.rst
index 118b9269..9576389e 100644
--- a/docs/public/user_guide.rst
+++ b/docs/public/user_guide.rst
@@ -17,28 +17,30 @@ Compiling and running a PyBuda workload is as easy as:
   import pybuda
   import torch
   from transformers import BertModel, BertConfig
-  
-  # Download the model from huggingface
-  model = BertModel.from_pretrained("bert-base-uncased")
-  
-  # Wrap the pytorch model in a PyBuda module wrapper
-  module = pybuda.PyTorchModule("bert_encoder", model.encoder)
-  
-  # Create a tenstorrent device
-  tt0 = pybuda.TTDevice(
-      "tt0",
-      module=module,
-      arch=pybuda.BackendDevice.Wormhole_B0,
-      devtype=pybuda.BackendType.Silicon,
-  )
-  
-  # Create an input tensor
-  seq_len = 128
-  input = torch.randn(1, seq_len, model.config.hidden_size)
-  
-  # Compile and run inference
-  output_queue = pybuda.run_inference(inputs=[input])
-  print(output_queue.get())
+
+  # Guard in the main module to avoid creating subprocesses recursively.
+  if __name__ == "__main__":
+      # Download the model from huggingface
+      model = BertModel.from_pretrained("bert-base-uncased")
+
+      # Wrap the pytorch model in a PyBuda module wrapper
+      module = pybuda.PyTorchModule("bert_encoder", model.encoder)
+
+      # Create a tenstorrent device
+      tt0 = pybuda.TTDevice(
+          "tt0",
+          module=module,
+          arch=pybuda.BackendDevice.Wormhole_B0,
+          devtype=pybuda.BackendType.Silicon,
+      )
+
+      # Create an input tensor
+      seq_len = 128
+      input = torch.randn(1, seq_len, model.config.hidden_size)
+
+      # Compile and run inference
+      output_queue = pybuda.run_inference(inputs=[input])
+      print(output_queue.get())
 
 
 Framework Support
@@ -90,7 +92,7 @@ PyBuda API and workflow is flexible enough that some of these steps can be merge
 Devices
 *******
 
-PyBuda makes it easy to distribute a workload onto a heterogenous set of devices available to you. This can be one or more 
+PyBuda makes it easy to distribute a workload onto a heterogeneous set of devices available to you. This can be one or more 
 Tenstorrent devices, CPUs, or GPUs. Each device that will be used to run your workflow needs to be declared by creating the appropriate
 device type and giving it a unique name:
 
@@ -121,7 +123,7 @@ To run a module on a device, it needs to be "placed" on it
    tt0.place_module(mod)
 
 This tells PyBuda that module ``mod`` needs to be compiled and executed on device ``tt0``. In this case, ``mod`` is a native PyBuda module. To
-simiarly place a PyTorch module onto a Tenstorrent device, the module must be wrapped in a :py:class:`PyTorchModule<pybuda.PyTorchModule>` wrapper:
+similarly place a PyTorch module onto a Tenstorrent device, the module must be wrapped in a :py:class:`PyTorchModule<pybuda.PyTorchModule>` wrapper:
 
 .. code-block:: python
 
@@ -147,7 +149,7 @@ PyBuda provides all-in-one APIs for compiling and running workloads, :py:func:`r
 For inference, and simple training setups, this is the simplest way to get up and running.
 
 Alternatively, the models can be compiled in a separate step, using the :py:func:`initialize_pipeline<pybuda.initialize_pipeline>` call, 
-which optioanlly takes sample inputs, if none have been pushed into the first device. Once the compilation has completed, the user 
+which optionally takes sample inputs, if none have been pushed into the first device. Once the compilation has completed, the user 
 can run :py:func:`run_forward<pybuda.run_forward>` pass through the pipeline for inference, or a loop of 
 :py:func:`run_forward<pybuda.run_forward>`, :py:func:`run_backward<pybuda.run_backward>`, and :py:func:`run_optimizer<pybuda.run_optimizer>` 
 calls to manually implement a training loop:
@@ -165,10 +167,10 @@ calls to manually implement a training loop:
 CPU Fallback
 ************
 
-If there are operators in the workload that are unsuppored by PyBuda, the user can create a CPUDevice and place module containing those 
+If there are operators in the workload that are unsupported by PyBuda, the user can create a CPUDevice and place module containing those 
 operators onto that CPUDevice. If enabled, PyBuda is capable of doing this automatically.
 
-If a TTDevice contains unsuppored operators, during compilation, the device will be split into mupltiple devices (TTDevice and CPUDevice). If
+If a TTDevice contains unsupported operators, during compilation, the device will be split into multiple devices (TTDevice and CPUDevice). If
 the CPUDevice is at the front of the pipeline (i.e. the unsupported ops are in the first half of the graph), any inputs pushed to the TTDevice
 will be redirected to the correct CPUDevice. 
 
@@ -214,7 +216,7 @@ Output queues hold PyBuda tensors. For each PyBuda tensor, user can convert it b
     output_in_tf = output_q[0].to_framework("tensorflow")
 
 Advanced training scenarios sometimes require accumulated gradients to be retrieved and analyzed. For those cases, PyBuda provides an 
-:py::func:`API<pybuda.get_parameter_gradients>` that retrieves a dictionary of all currently accumulated gradients on a device. This can be used to 
+:py:func:`API<pybuda.get_parameter_gradients>` that retrieves a dictionary of all currently accumulated gradients on a device. This can be used to 
 debug or analyze data, or even run a manual optimizer and push new weights onto the device.
 
 Saving and Loading Models
@@ -623,20 +625,22 @@ Here is a simple example to (1) tag operations of interest and (2) fetch interme
            matmul2 = pybuda.op.Matmul("matmul2", matmul1_gelu, self.weights2)
            return matmul2
 
-   # Configure Pybuda compilation options to include a list of operations to collect intermediate tensors
-   tagged_operations = ["matmul1", "gelu"]
-   pybuda.set_configuration_options(op_intermediates_to_save=tagged_operations)
+   # Guard in the main module to avoid creating subprocesses recursively.
+   if __name__ == "__main__":
+       # Configure Pybuda compilation options to include a list of operations to collect intermediate tensors
+       tagged_operations = ["matmul1", "gelu"]
+       pybuda.set_configuration_options(op_intermediates_to_save=tagged_operations)
 
-   # Invoke the run_inference API to create device, compile and run module on device:
-   output_q = pybuda.run_inference(PyBudaTestModule("test_module"), inputs=[torch.randn(1, 32, 32)])
+       # Invoke the run_inference API to create device, compile and run module on device:
+       output_q = pybuda.run_inference(PyBudaTestModule("test_module"), inputs=[torch.randn(1, 32, 32)])
 
-   # After running inference, the intermediates queue will contain the ordered list of tagged intermediates
-   intermediates_queue = pybuda.get_intermediates_queue()
-   matmul1_tensor, gelu_tensor = intermediates_queue.get()
+       # After running inference, the intermediates queue will contain the ordered list of tagged intermediates
+       intermediates_queue = pybuda.get_intermediates_queue()
+       matmul1_tensor, gelu_tensor = intermediates_queue.get()
 
-   # Print tensor values recorded from device inference
-   print(matmul1_tensor)
-   print(gelu_tensor)
+       # Print tensor values recorded from device inference
+       print(matmul1_tensor)
+       print(gelu_tensor)
 
 
 Multiple Devices
@@ -647,7 +651,7 @@ Using Multiple Tenstorrent Devices
 
 PyBuda makes it easy to parallelize workloads onto multiple devices. A single :py:class:`TTDevice<pybuda.TTDevice>` can be used as a wrapper to any number of available 
 Tenstorrent devices accessible to the host - either locally or through ethernet. The PyBuda compiler will then break up the workload over
-assigned devices using either pipeline or model parllelism strategies, or a combination of both.
+assigned devices using either pipeline or model parallelism strategies, or a combination of both.
 
 The easiest way to use all available hardware is to set ``num_chips`` parameter in :py:class:`TTDevice<pybuda.TTDevice>` to 0, which instructs it to auto-detect and use everything it can find. 
 However, ``num_chips`` and ``chip_ids`` parameters can be used to select a subset of available hardware:
@@ -765,7 +769,7 @@ The following Python code generates a Multi-Model TTI in a manner identical to t
 
   model_binary_loc = "device_images_to_merge"
   models_to_merge = ["bert_large", "deit", "hrnet", "inception", "mobilenet_v1", "mobilenet_v2", "mobilenet_v3", "resnet", "unet", "vit"]
-  target_arch = "wormhole_b0
+  target_arch = "wormhole_b0"
   merged_model_location = "multi_model_workload.tti"
 
   # Individual Model Generation Code Goes Here
@@ -776,7 +780,7 @@ The following Python code generates a Multi-Model TTI in a manner identical to t
 
 During the model fusion process, the API presented above is responsible for performing memory reallocation. Users may be interested in the memory footprint of the fused model (both Device and Host DRAM).
 
-To fullfil this requirement, the tool reports memory utilization post reallocation. An example using a model compiled for Wormhole (with 6 Device and upto 4 Host DRAM channels) is provided below.
+To fulfill this requirement, the tool reports memory utilization post reallocation. An example using a model compiled for Wormhole (with 6 Device and up to 4 Host DRAM channels) is provided below.
 
 .. code-block:: bash
 

From 05ff6c74df30a2700c3c6145cb3ffabc2f6714b6 Mon Sep 17 00:00:00 2001
From: Vladimir Brkic <vbrkic@tenstorrent.com>
Date: Mon, 24 Jun 2024 07:57:04 +0000
Subject: [PATCH 25/29] Connect multiple open nodes

(cherry picked from commit 8a4121eb946d9929d9085d99f7412e950103fa1b)
---
 pybuda/pybuda/op_repo/shapes.py      |  2 +-
 pybuda/test/README.debug.md          |  4 +-
 pybuda/test/random/rgg/algorithms.py | 98 ++++++++++++++++++----------
 pybuda/test/random/rgg/base.py       | 16 ++++-
 pybuda/test/random/rgg/config.py     |  4 +-
 pybuda/test/random/rgg/datatypes.py  |  5 +-
 pybuda/test/random/rgg/utils.py      | 25 ++++++-
 pybuda/test/random/test_graphs.py    | 78 +++++++++++++++++++---
 pybuda/test/utils.py                 | 11 ++++
 9 files changed, 190 insertions(+), 53 deletions(-)

diff --git a/pybuda/pybuda/op_repo/shapes.py b/pybuda/pybuda/op_repo/shapes.py
index 814a1d36..ebe42814 100644
--- a/pybuda/pybuda/op_repo/shapes.py
+++ b/pybuda/pybuda/op_repo/shapes.py
@@ -56,4 +56,4 @@ def randomize_size(n: int, rng_shape: Random) -> int:
     Returns:
         int: random size of an dimension
     '''
-    return n + (rng_shape.randint(0, 1) * 2 - 1) * rng_shape.randint(1, int(n/2))
+    return n + (rng_shape.randint(0, 1) * 2 - 1) * rng_shape.randint(0, n // 2)
diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md
index f2b4b33d..3153dbdf 100644
--- a/pybuda/test/README.debug.md
+++ b/pybuda/test/README.debug.md
@@ -10,4 +10,6 @@
  * MAX\_OP\_SIZE\_PER\_DIM: Maximum size of an operator dimension. Smaller operator size results in fewer failed tests. (default: 512)
  * MIN_MICROBATCH_SIZE: Minimal size of microbatch of an input tensor. (default: 1)
  * MAX_MICROBATCH_SIZE: Maximum size of microbatch of an input tensor. (default: 8)
- * NUM\_OF\_NODES: Maximum number of nodes to be generated by RGG. (default: 10)
+ * NUM\_OF\_NODES\_MIN: Minimal number of nodes to be generated by RGG. (default: 5)
+ * NUM\_OF\_NODES\_MAX: Maximum number of nodes to be generated by RGG. (default: 10)
+ * NUM\_OF\_FORK\_JOINS\_MAX: Maximum number of fork joins to be generated by random graph algorithm in RGG. (default: 50)
diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py
index 6dee31bd..9e7880b2 100644
--- a/pybuda/test/random/rgg/algorithms.py
+++ b/pybuda/test/random/rgg/algorithms.py
@@ -5,6 +5,7 @@
 
 
 import random
+from typing import List
 from loguru import logger
 
 from pybuda.op_repo import OperatorDefinition
@@ -53,6 +54,7 @@ def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random):
             node.index = op_index_cnt
 
         # Storing output values if needed as explicit input for later operator
+        logger.trace("Setting out_value for nodes")
         for node in nodes:
             # setting default output variable name
             node.out_value = "v"
@@ -62,6 +64,7 @@ def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random):
                     input_node.out_value = input_node.operator_name()
                     logger.trace(f"Set out_value = {input_node.out_value}")
 
+        logger.trace("Setting input nodes for open nodes")
         open_nodes = NodeUtils.get_open_nodes(nodes)
         logger.trace(f"Open nodes {StrUtils.nodes_to_str(open_nodes)}")
 
@@ -78,10 +81,12 @@ def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random):
                     graph.input_nodes.append(input_node)
                 node.inputs.append(input_node)
 
+        logger.trace("Generating random settings for operator parameters")
         # Generate random values for operator parameters
         for node in nodes:
             node.constructor_kwargs = RandomUtils.constructor_kwargs(node.operator, node.constructor_kwargs, rng_params)
             node.forward_kwargs = RandomUtils.forward_kwargs(node.operator, node.forward_kwargs, rng_params)
+        logger.trace("Random settings for operator parameters generated")
 
     @classmethod
     def validate_graph(cls, graph: RandomizerGraph):
@@ -111,10 +116,17 @@ def validate_graph(cls, graph: RandomizerGraph):
 
     @classmethod
     def prepare_graph(cls, graph: RandomizerGraph, rng_params: random.Random):
+        logger.trace("Initializing nodes")
         cls.init_nodes(graph, rng_params)
+        logger.trace("Nodes initialized")
+
+        logger.trace("Validating graph")
         cls.validate_graph(graph)
+        logger.trace("Graph validated")
 
+        logger.trace("Serializing nodes")
         nodes_str = StrUtils.nodes_to_str(graph.nodes)
+        logger.trace("Nodes serialized")
         logger.trace(f"Nodes: \n{nodes_str}")
 
 
@@ -143,11 +155,13 @@ def _init_default_constructor_params(self, node: RandomizerNode):
             node.constructor_kwargs["out_channels"] = node.output_shape[1]
 
     # Build graph of random operators via random graph building algorithm
-    # Graph contains between num_of_nodes/2 and num_of_nodes nodes
+    # Graph contains between num_of_nodes_min and num_of_nodes_max nodes
     # Graph is constructed backwards starting from end node
     # In each step a random operator is selected and a new node is created
-    # New node is connected to the last node and optionally to a random node with the same input shape
-    # When new node is connected to 2 nodes graph contains a fork join
+    # Output of new node is connected as input to multiple randomly selected open nodes which have the same input shape
+    # When new node is connected to more than one node, graph constructs a fork join
+    # Output shape of first node is random
+    # Output shape of other nodes is based on next input shape of a randomly picked open node
     # Input shapes for each node are calculated based on output shape of the node
     def build_graph(self, test_context: RandomizerTestContext):
         '''Implementation of the random graph building algorithm'''
@@ -164,46 +178,60 @@ def build_graph(self, test_context: RandomizerTestContext):
         # Initialize random number generators for parameters
         rng_params = random.Random(test_context.parameters.random_seed)
 
-        num_of_nodes = self.randomizer_config.num_of_nodes
+        fork_join_counter = 0
+        fork_join_max = test_context.randomizer_config.num_fork_joins_max
 
-        # Building the graph with number of nodes between n/2 and n
-        # num_of_nodes defines max number of nodes in the graph
-        for _ in range(rng_graph.randint(int(num_of_nodes/2), num_of_nodes)):
+        # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max
+        num_of_nodes = rng_graph.randint(self.randomizer_config.num_of_nodes_min, self.randomizer_config.num_of_nodes_max) 
+        for node_index in range(num_of_nodes, 0, -1):
             # Choose operator randomly based on rng
             op1 = self._get_random_operator(rng_graph)
 
-            # Last node defines output shape for next node to create
-            last_node: RandomizerNode = None
-            # Random node is selected by matching the same input shape to support fork joins
-            # TODO random_node -> random_nodes, select all random_nodes instead of just one
-            # TODO: obsolete last_node in flavor of random_nodes
-            random_node: RandomizerNode = None
-
-            if len(nodes) > 0:
-                # If graph is not empty find previusly added node
-                last_node = nodes[0]
+            # Find all open nodes
+            open_nodes = NodeUtils.get_open_nodes(nodes)
 
-            if len(nodes) == 0:
-                # Setting output shape for the first node
+            # Select output shape for the new node
+            if len(open_nodes) == 0:
+                # For the first node set output shape as random shape
                 output_shape = RandomUtils.random_shape_from_config(self.randomizer_config, rng_shape)
             else:
-                # Setting output shape based on last node input shapes
-                input_shapes = last_node.input_shapes
-                output_shape = input_shapes[len(last_node.inputs)]
-
-            # Find open nodes with input shape mathing the output shape of new node
+                # For other nodes, output shape is based on input shapes of a random open node
+                # Select one of open nodes randomly
+                random_open_node: RandomizerNode = rng_graph.choice(open_nodes)
+                # Setting output shape based on input shapes of the random open node
+                input_shapes = random_open_node.input_shapes
+                output_shape = input_shapes[len(random_open_node.inputs)]
+
+            # Find all other open nodes with input shape matching the output shape of new node
             open_nodes = NodeUtils.get_open_nodes_with_input_shape(nodes, output_shape)
 
-            if len(open_nodes) > 0:
-                # Randomly selecting one of the open nodes
-                random_node = rng_graph.choice(open_nodes)
+            # Random nodes are selected from open nodes whose input shape matches the output shape of new node
+            # Closing multiple nodes will construct fork joins
+            random_nodes: List[RandomizerNode]
 
-            if last_node is not None and random_node is not None and last_node == random_node:
-                # Skip random_node if it's the same as last_node
-                random_node = None
+            if len(open_nodes) > 0:
+                # There must be at least one node to close
+                subset_count_min = max(1, len(open_nodes) // 2)
+                subset_count_max = len(open_nodes)
+                # Choose a random number of nodes to close
+                subset_count = rng_graph.randint(subset_count_min, subset_count_max)
+
+                # Limit number of fork joins
+                subset_count = min(subset_count, fork_join_max - fork_join_counter + 1)
+
+                # Increase fork join counter
+                new_fork_join = subset_count - 1
+                if new_fork_join > 0:
+                    logger.trace(f"Constructing {new_fork_join} new fork join(s) from operator op{node_index} {op1.name}")
+                fork_join_counter += new_fork_join
+
+                # Select random subset of open nodes to close
+                random_nodes = rng_graph.sample(open_nodes, subset_count)
+            else:
+                random_nodes = []
 
-            # Closing nodes are last_node and optionally random_node
-            closing_nodes = [closing_node for closing_node in [last_node, random_node] if closing_node is not None]
+            # Closing nodes are all random open nodes
+            closing_nodes = random_nodes
 
             # Creating new node
             node = RandomizerNode(operator=op1, output_shape=output_shape)
@@ -217,12 +245,14 @@ def build_graph(self, test_context: RandomizerTestContext):
                 for _ in range(rng_graph.randint(1, closing_node.operator.input_num - len(closing_node.inputs))):
                     # currently only if next input of closing node matches the output shape a closing node will be actually closed
                     # TODO check all inputs for matching shapes not just next one
-                    # if second operands is different shape than first one it will most likely not be closed with an internal node but with external input
-                    # e.x. second operand of matmul usually connect to external input instead of an internal node
                     if closing_node.input_shapes[len(closing_node.inputs)] == node.output_shape:
                         closing_node.inputs.append(node)
 
             open_nodes.append(node)
             nodes.insert(0, node)
 
+        logger.trace(f"Graph built with {len(nodes)} nodes")
+
+        logger.trace("Preparing graph")
         GraphNodeSetup.prepare_graph(graph, rng_params)
+        logger.trace("Graph prepared")
diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py
index 255e6b0c..e4f8d6e9 100644
--- a/pybuda/test/random/rgg/base.py
+++ b/pybuda/test/random/rgg/base.py
@@ -14,6 +14,7 @@
 from pybuda.verify import verify_module, VerifyConfig
 from pybuda.op_repo import OperatorRepository
 from test.conftest import TestDevice
+from test.utils import Timer
 from .datatypes import RandomizerNode, RandomizerGraph, RandomizerParameters, RandomizerConfig, ExecutionContext
 from .datatypes import RandomizerTestContext
 from .datatypes import TensorShape
@@ -231,7 +232,10 @@ def run(self, graph_builder: GraphBuilder):
         logger.debug(f"Parameters test_index: {parameters.test_index} random_seed: {parameters.random_seed} test_device: {parameters.test_device}")
 
         # build random graph for the specified parameters
+        logger.trace("Building graph started")
+        graph_duration = Timer()
         self.build_graph(graph_builder)
+        logger.trace("Building graph completed")
         graph = self.test_context.graph
         logger.debug(f"Generating graph model {GraphUtils.short_description(graph)}")
         if randomizer_config.print_graph:
@@ -249,16 +253,22 @@ def run(self, graph_builder: GraphBuilder):
             # saving test source code to file for debugging purposes
             self.save_test(test_code_str)
 
+        logger.debug(f"Graph built in: {graph_duration.get_duration():.4f} seconds")
+
         if randomizer_config.run_test:
             # instantiate PyBuda model
             model = self.build_model()
             # perform model validation
-            self.verify(model)
+            try:
+                verify_duration = Timer()
+                self.verify(model)
+            finally:
+                logger.debug(f"Test verified in: {verify_duration.get_duration():.4f} seconds")
         else:
             logger.info("Skipping test run")
 
 
-def process_test(test_index: int, random_seed: int, test_device: TestDevice, randomizer_config: RandomizerConfig, graph_builder_type: Type[GraphBuilder], framework: Framework):
+def process_test(test_name: str, test_index: int, random_seed: int, test_device: TestDevice, randomizer_config: RandomizerConfig, graph_builder_type: Type[GraphBuilder], framework: Framework):
     '''
     Process a single randomizer test.
 
@@ -277,7 +287,7 @@ def process_test(test_index: int, random_seed: int, test_device: TestDevice, ran
     # instantiate parameters
     parameters = RandomizerParameters(test_index, random_seed, test_device, framework_name=framework.framework_name.lower(), graph_builder_name=graph_builder.get_name())
     # instantiate test_context
-    test_context = RandomizerTestContext(randomizer_config=randomizer_config, parameters=parameters, graph=None)
+    test_context = RandomizerTestContext(randomizer_config=randomizer_config, parameters=parameters, graph=None, test_name=test_name)
     # instantiate graph_builder
     model_builder = framework.ModelBuilderType()
     # instantiate runner
diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py
index 3941e2fa..58f11d8e 100644
--- a/pybuda/test/random/rgg/config.py
+++ b/pybuda/test/random/rgg/config.py
@@ -29,6 +29,8 @@ def get_randomizer_config_default():
         # op_size_per_dim_max=int(os.environ.get("MAX_OP_SIZE_PER_DIM", 512)),
         microbatch_size_min=int(os.environ.get("MIN_MICROBATCH_SIZE", 1)),
         microbatch_size_max=int(os.environ.get("MAX_MICROBATCH_SIZE", 8)),
-        num_of_nodes=int(os.environ.get("NUM_OF_NODES", 10)),
+        num_of_nodes_min=int(os.environ.get("NUM_OF_NODES_MIN", 5)),
+        num_of_nodes_max=int(os.environ.get("NUM_OF_NODES_MAX", 10)),
+        num_fork_joins_max=int(os.environ.get("NUM_OF_FORK_JOINS_MAX", 50)),
     )
     return randomizer_config
diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py
index 1b120534..8a2c6394 100644
--- a/pybuda/test/random/rgg/datatypes.py
+++ b/pybuda/test/random/rgg/datatypes.py
@@ -90,7 +90,9 @@ class RandomizerConfig:
     op_size_per_dim_max: int = 512
     microbatch_size_min: int = 1
     microbatch_size_max: int = 8
-    num_of_nodes: int = 10
+    num_of_nodes_min: int = 5
+    num_of_nodes_max: int = 10
+    num_fork_joins_max: int = 50
 
 
 @dataclass
@@ -100,3 +102,4 @@ class RandomizerTestContext:
     # framework: Framework
     # graph_builder: GraphBuilder
     graph: Optional[RandomizerGraph]  # graph will be constructed later during test processing
+    test_name: str = "Default"
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index af2aee84..dc533199 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -40,16 +40,35 @@ def camel_case_to_snake_case(camel_case: str) -> str:
         snake_case = re.sub(pattern, '_', camel_case).lower()
         return snake_case
 
+    @staticmethod
+    def text_to_snake_case(text: str) -> str:
+        text = text.lower()
+        pattern = re.compile(r'\ +')
+        snake_case = re.sub(pattern, '_', text).lower()
+        return snake_case
+
     @classmethod
     def test_id(cls, test_context: RandomizerTestContext) -> str:
         parameters = test_context.parameters
         graph_builder_snake_case = cls.camel_case_to_snake_case(parameters.graph_builder_name)
-        test_id = f"{parameters.framework_name}_{graph_builder_snake_case}_{parameters.test_index}_{parameters.random_seed}"
+        test_name = cls.text_to_snake_case(test_context.test_name)
+        test_id = f"{parameters.framework_name}_{graph_builder_snake_case}_{test_name}_{parameters.test_index}_{parameters.random_seed}"
         return test_id
 
     @staticmethod
     def nodes_to_str(nodes: List[RandomizerNode]) -> str:
-        nodes_str = "\n".join([f"    {node}" for node in nodes])
+        '''Converts list of nodes to string representation
+        Used for debugging purposes
+        
+        Args:
+            nodes (List[RandomizerNode]): list of nodes
+
+        Returns:
+            str: string representation of nodes
+        '''
+        # TODO Very slow -> implement in a faster way; until then serialization is disabled and an empty string is returned
+        # nodes_str = "\n".join([f"    {node}" for node in nodes])
+        nodes_str = ""
         return nodes_str
 
 
@@ -155,7 +174,7 @@ def is_previous_node(node: RandomizerNode, previous_node: RandomizerNode) -> boo
 
     @staticmethod
     def is_open(node: RandomizerNode) -> bool:
-        return (node.inputs is None or len(node.inputs) == 0) or (node.operator.input_num > 1 and len(node.inputs) < node.operator.input_num)
+        return (len(node.inputs) if node.inputs else 0)  < node.operator.input_num
 
     @classmethod
     def get_open_nodes(cls, nodes: List[RandomizerNode]) -> List[RandomizerNode]:
diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py
index 13eee3c5..251f3301 100644
--- a/pybuda/test/random/test_graphs.py
+++ b/pybuda/test/random/test_graphs.py
@@ -21,17 +21,28 @@
 
 class FrameworkTestUtils:
 
-    @staticmethod
-    def copy_framework(framework: Framework, skip_operators: Tuple[str] = []):
+    @classmethod
+    def copy_framework(cls, framework: Framework, skip_operators: Tuple[str] = []) -> Framework:
         framework0 = framework
         framework = copy(framework)
         framework.operator_repository = copy(framework.operator_repository)
-        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name not in skip_operators]
+        cls.skip_operators(framework, skip_operators)
         assert len(framework.operator_repository.operators) + len(skip_operators) == len(framework0.operator_repository.operators), "Operators count should match after skipping operators"
         return framework
 
-    @staticmethod
-    def copy_operator(framework: Framework, operator_name: str) -> OperatorDefinition:
+    @classmethod
+    def skip_operators(cls, framework: Framework, skip_operators: Tuple[str] = []) -> None:
+        initial_operator_count = len(framework.operator_repository.operators)
+        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name not in skip_operators]
+        assert len(framework.operator_repository.operators) + len(skip_operators) == initial_operator_count, "Operators count should match after skipping operators"
+
+    @classmethod
+    def allow_operators(cls, framework: Framework, allow_operators: Tuple[str] = []) -> None:
+        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name in allow_operators]
+        assert len(allow_operators) == len(framework.operator_repository.operators), "Operators count should match allowing skipping operators"
+
+    @classmethod
+    def copy_operator(cls, framework: Framework, operator_name: str) -> OperatorDefinition:
         operators = framework.operator_repository.operators
 
         i, operator = next(((i, operator) for i, operator in enumerate(operators) if operator.name == operator_name), (None, None))
@@ -78,6 +89,24 @@ def healty_pybuda():
 
         return framework
 
+    @staticmethod
+    def pybuda_matmul_joins():
+        SKIP_OPERATORS = (
+        )
+
+        framework = FrameworkTestUtils.copy_framework(Frameworks.PYBUDA.value, SKIP_OPERATORS)
+
+        ALLOW_OPERATORS = (
+            "relu",
+            "tanh",
+            "add",
+            "matmul",
+        )
+
+        FrameworkTestUtils.allow_operators(framework, ALLOW_OPERATORS)
+
+        return framework
+
     @staticmethod
     def healty_pytorch():
         SKIP_OPERATORS = (
@@ -91,6 +120,7 @@ def healty_pytorch():
         return framework
     
     PYBUDA = healty_pybuda()
+    PYBUDA_MATMUL_JOINS = pybuda_matmul_joins()
     PYTORCH = healty_pytorch()
 
 
@@ -111,11 +141,39 @@ def test_random_graph_algorithm_pybuda(test_index, random_seeds, test_device, ra
     # randomizer_config.op_size_per_dim_max = 256
     randomizer_config.microbatch_size_min = 1
     randomizer_config.microbatch_size_max = 8
-    randomizer_config.num_of_nodes = 10
+    randomizer_config.num_of_nodes_min = 5
+    randomizer_config.num_of_nodes_max = 10
+    randomizer_config.num_fork_joins_max = 5
+
+    # TODO random_seed instead of random_seeds
+    random_seed = random_seeds[test_index]
+    process_test("Default", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
+
+
+@pytest.mark.parametrize("framework", [
+    FrameworksHealthy.PYBUDA_MATMUL_JOINS.value,
+])
+def test_random_graph_algorithm_pybuda_matmul_joins(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework):
+    # adjust randomizer_config
+    randomizer_config = copy(randomizer_config)
+    # randomizer_config.debug_shapes = True
+    # randomizer_config.verify_shapes = True
+    randomizer_config.dim_min = 3
+    randomizer_config.dim_max = 4
+    randomizer_config.op_size_per_dim_min = 4
+    # randomizer_config.op_size_per_dim_min = 16
+    randomizer_config.op_size_per_dim_max = 8
+    # randomizer_config.op_size_per_dim_max = 64
+    # randomizer_config.op_size_per_dim_max = 256
+    randomizer_config.microbatch_size_min = 1
+    randomizer_config.microbatch_size_max = 8
+    randomizer_config.num_of_nodes_min = 10
+    randomizer_config.num_of_nodes_max = 15
+    randomizer_config.num_fork_joins_max = 10
 
     # TODO random_seed instead of random_seeds
     random_seed = random_seeds[test_index]
-    process_test(test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
+    process_test("Matmul Joins", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
 
 
 @pytest.mark.parametrize("framework", [
@@ -135,8 +193,10 @@ def test_random_graph_algorithm_pytorch(test_index, random_seeds, test_device, r
     # randomizer_config.op_size_per_dim_max = 256
     randomizer_config.microbatch_size_min = 1
     randomizer_config.microbatch_size_max = 8
-    randomizer_config.num_of_nodes = 5
+    randomizer_config.num_of_nodes_min = 3
+    randomizer_config.num_of_nodes_max = 5
+    randomizer_config.num_fork_joins_max = 5
 
     # TODO random_seed instead of random_seeds
     random_seed = random_seeds[test_index]
-    process_test(test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
+    process_test("Default", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework)
diff --git a/pybuda/test/utils.py b/pybuda/test/utils.py
index fd34702e..8d8d2e30 100644
--- a/pybuda/test/utils.py
+++ b/pybuda/test/utils.py
@@ -23,3 +23,14 @@ def download_model(download_func, *args, num_retries=3, timeout=180, **kwargs):
     logger.error("Failed to download the model after multiple retries.")
     assert False, "Failed to download the model after multiple retries."
     
+
+class Timer:
+
+    def __init__(self):
+        self.start_time = time.perf_counter()
+
+    def get_duration(self):
+        end_time = time.perf_counter()
+        duration = end_time - self.start_time
+        return duration
+

From 42bafb00588fe60b26e538733f3a528204a745dc Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic <vmilosevic@tenstorrent.com>
Date: Thu, 27 Jun 2024 14:10:33 +0000
Subject: [PATCH 26/29] Update submodules

---
 third_party/benchmarking     | 1 +
 third_party/buda-model-demos | 2 +-
 third_party/budabackend      | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)
 create mode 160000 third_party/benchmarking

diff --git a/third_party/benchmarking b/third_party/benchmarking
new file mode 160000
index 00000000..da7872c7
--- /dev/null
+++ b/third_party/benchmarking
@@ -0,0 +1 @@
+Subproject commit da7872c76fbd1e5a71e11c1c9fec7cd090f7f1fd
diff --git a/third_party/buda-model-demos b/third_party/buda-model-demos
index 2de3d760..a6739e0e 160000
--- a/third_party/buda-model-demos
+++ b/third_party/buda-model-demos
@@ -1 +1 @@
-Subproject commit 2de3d7607fc973446bef0664cb5d32aa8e0198a9
+Subproject commit a6739e0ef00565c4b5c4ee2d8251c9f53428b888
diff --git a/third_party/budabackend b/third_party/budabackend
index f146c92f..44ae26b5 160000
--- a/third_party/budabackend
+++ b/third_party/budabackend
@@ -1 +1 @@
-Subproject commit f146c92f49560bd2c34884f00adc53ba935ca1dc
+Subproject commit 44ae26b5edad8acf7dd5b4335f716ddc617ee8c4

From ab395c87880baf8eca91af21949a91f2b58b0cc6 Mon Sep 17 00:00:00 2001
From: Vladimir Canic <vcanic@tenstorrent.com>
Date: Fri, 21 Jun 2024 13:43:15 +0000
Subject: [PATCH 27/29] Add path to erisc hex files.

---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index bc60db71..7fff53fe 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,8 @@ DOCSDIR = $(OUT)/docs
 SUBMODULESDIR = $(OUT)/submodules
 TORCHVISIONDIR = build_deps/vision
 
+export TT_BACKEND_ERISC_PRECOMPILED_BINARIES_PATH=./erisc_hex/
+
 # Top level flags, compiler, defines etc.
 
 #WARNINGS ?= -Wall -Wextra

From b9c0cc0389ae2036fb42f90033f8ff118489489f Mon Sep 17 00:00:00 2001
From: Vladimir Canic <vcanic@tenstorrent.com>
Date: Mon, 24 Jun 2024 10:24:48 +0000
Subject: [PATCH 28/29] Update ERISC path in setup.py.

---
 setup.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index e92b2e88..b9f2b95a 100644
--- a/setup.py
+++ b/setup.py
@@ -146,10 +146,11 @@
 
 if "BACKEND_ARCH_NAME" in os.environ and os.environ["BACKEND_ARCH_NAME"] == "wormhole_b0" or os.environ["BACKEND_ARCH_NAME"] == "blackhole":
     bbe_files["firmware_erisc_hex"] = {
-        "path": "build/src/firmware/riscv/targets/erisc_app/out",
+        # "path": "build/src/firmware/riscv/targets/erisc_app/out",
+        "path": "erisc_hex",
         "files": [
             "erisc_app.hex",
-            "erisc_app.elf",
+            # "erisc_app.elf",
             "erisc_app.iram.hex",
             "erisc_app.l1.hex",
             "split_iram_l1"

From 8772356f2b02a4d226cb8a530d2e094222bbfbb2 Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic <vmilosevic@tenstorrent.com>
Date: Thu, 27 Jun 2024 14:16:24 +0000
Subject: [PATCH 29/29] Update GitHub action to build for wormhole_b0

---
 .github/workflows/build-artifacts.yml       | 15 +++++++++++++--
 .github/workflows/post-commit-workflow.yml  | 13 -------------
 .github/workflows/pull-request-workflow.yml | 13 -------------
 3 files changed, 13 insertions(+), 28 deletions(-)
 delete mode 100644 .github/workflows/post-commit-workflow.yml
 delete mode 100644 .github/workflows/pull-request-workflow.yml

diff --git a/.github/workflows/build-artifacts.yml b/.github/workflows/build-artifacts.yml
index 5e52e7e8..e272c417 100644
--- a/.github/workflows/build-artifacts.yml
+++ b/.github/workflows/build-artifacts.yml
@@ -3,15 +3,26 @@ name: Build artifacts
 on:
   workflow_dispatch:
   workflow_call:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
 
 env:
   PYTHON_VERSION: "python3.10"
 
 jobs:
   build-artifacts:
+    
     strategy:
       matrix:
-        arch: ["grayskull"]
+        include:
+          - arch: grayskull
+            env_script: env_for_silicon.sh
+          - arch: wormhole_b0
+            env_script: env_for_wormhole_b0.sh
     runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
@@ -21,4 +32,4 @@ jobs:
       - name: Update submodule
         run: git submodule update --init --recursive
       - name: Build for ${{ matrix.arch }}
-        run: source env_for_silicon.sh
\ No newline at end of file
+        run: source ${{ matrix.env_script }}
diff --git a/.github/workflows/post-commit-workflow.yml b/.github/workflows/post-commit-workflow.yml
deleted file mode 100644
index ceb7d58d..00000000
--- a/.github/workflows/post-commit-workflow.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: Post commit workflow
-
-on:
-  workflow_dispatch:
-  workflow_call:
-  push:
-    branches:
-      - main
-
-jobs:
-  build-artifacts:
-    uses: ./.github/workflows/build-artifacts.yml
-    secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/pull-request-workflow.yml b/.github/workflows/pull-request-workflow.yml
deleted file mode 100644
index c5fbe795..00000000
--- a/.github/workflows/pull-request-workflow.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: Pull request workflow
-
-on:
-  workflow_dispatch:
-  workflow_call:
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  build-artifacts:
-    uses: ./.github/workflows/build-artifacts.yml
-    secrets: inherit
\ No newline at end of file