From 87c2ca340b9ea34b4c9c66ac1c9d0ce6fbb9fd57 Mon Sep 17 00:00:00 2001 From: Sterling Taylor <166402033+staylorTT@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:37:59 -0500 Subject: [PATCH 001/116] Delete pybuda/test/model_demos/models/tri_basic_2/model directory Cleanup --- .../models/tri_basic_2/model/__init__.py | 0 .../models/tri_basic_2/model/semseg.py | 145 ------------------ 2 files changed, 145 deletions(-) delete mode 100644 pybuda/test/model_demos/models/tri_basic_2/model/__init__.py delete mode 100644 pybuda/test/model_demos/models/tri_basic_2/model/semseg.py diff --git a/pybuda/test/model_demos/models/tri_basic_2/model/__init__.py b/pybuda/test/model_demos/models/tri_basic_2/model/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py b/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py deleted file mode 100644 index f575869c..00000000 --- a/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2023 Toyota Research Institute. All rights reserved. - -import math -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class BasicResidualBlock(nn.Module): - - def __init__(self, *, in_channels, out_channels, stride=1, dilation_rate=1): - super().__init__() - if in_channels == out_channels and stride == 1: - self.shortcut = nn.Identity() - else: - self.shortcut = nn.Sequential( - nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(out_channels), - ) - - self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=dilation_rate, - dilation=dilation_rate, bias=False) - self.norm1 = nn.BatchNorm2d(out_channels) - self.activation = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=dilation_rate, dilation=dilation_rate, - bias=False) - self.norm2 = nn.BatchNorm2d(out_channels) - - def forward(self, inputs): - shortcut = self.shortcut(inputs) - - outputs = self.activation(self.norm1(self.conv1(inputs))) - outputs = self.norm2(self.conv2(outputs)) - outputs = outputs + shortcut - outputs = self.activation(outputs) - return outputs - - -def resnet_group(*, block_func, in_channels, out_channels, stride, num_blocks, dilation_rates=[1]): - assert len(dilation_rates) > 0 - - residual_blocks = [ - block_func(in_channels=in_channels, out_channels=out_channels, stride=stride, dilation_rate=dilation_rates[0]) - ] - for idx in range(1, num_blocks): - residual_blocks.append(block_func(in_channels=out_channels, out_channels=out_channels, stride=1, - dilation_rate=dilation_rates[idx % len(dilation_rates)])) - return nn.Sequential(*residual_blocks) - - -class Fpn(nn.Module): - - def __init__(self, *, in_channels, out_channels): - super().__init__() - - idxs = [] - convs = [] - for idx, channels in enumerate(in_channels): - idxs.append(idx) - convs.append(nn.Conv2d(channels, out_channels, kernel_size=1, bias=True)) - self.idxs = idxs[::-1] - self.convs = nn.ModuleList(convs[::-1]) - - self.upsample2 = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) - - def forward(self, group_outputs: List[torch.Tensor]): - outputs = None - for idx, module in enumerate(self.convs): - current = module(group_outputs[self.idxs[idx]]) - if outputs is None: - outputs = current - else: - outputs = self.upsample2(outputs) + current - - return outputs - - -class BasicResNet(nn.Module): - - def __init__(self, 
hparams, *, num_blocks, num_channels, dilation_rates): - super().__init__() - assert len(num_blocks) == 4 - assert len(num_channels) == len(num_blocks) - assert len(dilation_rates) == len(num_blocks) - - self.num_channels = num_channels - - self.conv_in = nn.Conv2d(3, num_channels[0], kernel_size=7, padding=3, stride=2, bias=False) - self.norm_in = nn.BatchNorm2d(num_channels[0]) - self.activation_in = nn.ReLU(inplace=True) - self.pool_in = nn.MaxPool2d(kernel_size=2) - - self.group1 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[0], - out_channels=num_channels[0], stride=1, num_blocks=num_blocks[0], - dilation_rates=dilation_rates[0]) - self.group2 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[0], - out_channels=num_channels[1], stride=2, num_blocks=num_blocks[1], - dilation_rates=dilation_rates[1]) - self.group3 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[1], - out_channels=num_channels[2], stride=2, num_blocks=num_blocks[2], - dilation_rates=dilation_rates[2]) - self.group4 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[2], - out_channels=num_channels[3], stride=2, num_blocks=num_blocks[3], - dilation_rates=dilation_rates[3]) - - self.head = Fpn(in_channels=num_channels, out_channels=hparams.num_classes) - - self.upsample = nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False) - - def get_output_channels(self): - return self.num_channels - - def forward(self, inputs): - _, _, h, w = inputs.shape - - vpad = math.ceil(h / 32) * 32 - h - top_pad = vpad // 2 - bottom_pad = vpad - top_pad - hpad = math.ceil(w / 32) * 32 - w - left_pad = hpad // 2 - right_pad = hpad - left_pad - - inputs = F.pad(inputs, (left_pad, right_pad, top_pad, bottom_pad)) - - outputs = self.pool_in(self.activation_in(self.norm_in(self.conv_in(inputs)))) - group1_outputs = self.group1(outputs) - group2_outputs = self.group2(group1_outputs) - group3_outputs = self.group3(group2_outputs) - group4_outputs = self.group4(group3_outputs) - - outputs = [group1_outputs, group2_outputs, group3_outputs, group4_outputs] - logits = self.upsample(self.head(outputs)) - - logits = logits[:, :, top_pad:top_pad + h, left_pad:left_pad + w] - - return logits - - -def resnet34_semseg(hparams): - return BasicResNet(hparams, - num_blocks=[3, 4, 6, 3], - num_channels=[64, 128, 256, 512], - dilation_rates=[[1], [1], [1, 1, 2, 5, 9, 17], [1]]) From ef54980e53532d001a578a1d71ef1603cf59dac0 Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Tue, 25 Jun 2024 09:35:07 +0000 Subject: [PATCH 002/116] Patch perceiverio, segformer, tri_basic_2 model ci failures (cherry picked from commit debaf2f6d064177d4109459f2b69d93baad309d7) --- ...erceiverio.py => test_perceiverio_conv.py} | 58 ++----- .../cnn/onnx/test_perceiverio_fourier.py | 87 +++++++++++ .../cnn/onnx/test_perceiverio_learned.py | 83 ++++++++++ ...r_imgcls.py => test_segformer_imgcls_1.py} | 26 ++-- .../cnn/onnx/test_segformer_imgcls_2.py | 78 ++++++++++ ...gformer_seg.py => test_segformer_seg_1.py} | 33 ++-- .../cnn/onnx/test_segformer_seg_2.py | 94 ++++++++++++ .../high_prio/cnn/pytorch/test_perceiverio.py | 145 ------------------ .../cnn/pytorch/test_perceiverio_conv.py | 85 ++++++++++ .../cnn/pytorch/test_perceiverio_fourier.py | 91 +++++++++++ .../cnn/pytorch/test_perceiverio_learned.py | 85 ++++++++++ ...r_imgcls.py => test_segformer_imgcls_1.py} | 20 ++- .../cnn/pytorch/test_segformer_imgcls_2.py | 82 ++++++++++ ...r_semseg.py => test_segformer_semseg_1.py} | 17 +- 
.../cnn/pytorch/test_segformer_semseg_2.py | 85 ++++++++++ .../high_prio/cnn/pytorch/test_tri_basic_2.py | 3 + 16 files changed, 839 insertions(+), 233 deletions(-) rename pybuda/test/model_demos/high_prio/cnn/onnx/{test_perceiverio.py => test_perceiverio_conv.py} (51%) create mode 100644 pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py create mode 100644 pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py rename pybuda/test/model_demos/high_prio/cnn/onnx/{test_segformer_imgcls.py => test_segformer_imgcls_1.py} (82%) create mode 100644 pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py rename pybuda/test/model_demos/high_prio/cnn/onnx/{test_segformer_seg.py => test_segformer_seg_1.py} (71%) create mode 100644 pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py delete mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py rename pybuda/test/model_demos/high_prio/cnn/pytorch/{test_segformer_imgcls.py => test_segformer_imgcls_1.py} (84%) create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_2.py rename pybuda/test/model_demos/high_prio/cnn/pytorch/{test_segformer_semseg.py => test_segformer_semseg_1.py} (80%) create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_2.py diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py similarity index 51% rename from pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py rename to pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py index 276a3d25..197fabbf 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py @@ -7,7 +7,6 @@ import os import requests from PIL import Image -import pytest from transformers import AutoImageProcessor @@ -25,15 +24,9 @@ def get_sample_data(model_name): return pixel_values -@pytest.mark.parametrize( - "model_name", - [ - "deepmind/vision-perceiver-conv", - "deepmind/vision-perceiver-learned", - "deepmind/vision-perceiver-fourier", - ], -) -def test_perceiver_for_image_classification_onnx(test_device, model_name): +def test_perceiverio_conv_imgcls_onnx(test_device): + + model_name = "deepmind/vision-perceiver-conv" # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() @@ -44,43 +37,26 @@ def test_perceiver_for_image_classification_onnx(test_device, model_name): pcc_value = 0.96 if test_device.arch == pybuda.BackendDevice.Wormhole_B0: - - if model_name == "deepmind/vision-perceiver-learned": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{105*1024}" - compiler_cfg.balancer_op_override("add_63", "t_stream_shape", (1, 2)) - if test_device.devtype == pybuda.BackendType.Silicon: - pcc_value = 0.95 - - elif model_name == "deepmind/vision-perceiver-conv": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" - compiler_cfg.balancer_op_override("multiply_19", "t_stream_shape", (1, 1)) - compiler_cfg.balancer_op_override("multiply_142", "t_stream_shape", (1, 1)) - compiler_cfg.balancer_op_override("multiply_3103", 
"t_stream_shape", (1, 1)) - compiler_cfg.balancer_op_override("multiply_3123", "t_stream_shape", (1, 1)) - compiler_cfg.balancer_op_override("multiply_2745", "t_stream_shape", (1, 1)) - compiler_cfg.balancer_op_override("multiply_2934", "t_stream_shape", (1, 1)) - - elif model_name == "deepmind/vision-perceiver-fourier": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" - compiler_cfg.balancer_op_override("add_58", "t_stream_shape", (1, 2)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" + compiler_cfg.balancer_op_override("multiply_19", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_142", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_3103", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_3123", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_2745", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_2934", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_79", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_99", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override( + "max_pool2d_35.dc.reshape.10.dc.sparse_matmul.13.lc2", + "t_stream_shape", + (1, 1), + ) elif test_device.arch == pybuda.BackendDevice.Grayskull: if test_device.devtype == pybuda.BackendType.Silicon: verify_enabled = False - if model_name == "deepmind/vision-perceiver-learned": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" - - elif model_name == "deepmind/vision-perceiver-fourier": - os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" - compiler_cfg.place_on_new_epoch("hslice_50.dc.sparse_matmul.2.lc2") - compiler_cfg.place_on_new_epoch("matmul_47") - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" - compiler_cfg.balancer_op_override( - "hslice_50.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) - ) - onnx_model_path = ( "third_party/confidential_customer_models/generated/files/" + str(model_name).split("/")[-1].replace("-", "_") diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py new file mode 100644 index 00000000..1c575c31 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import onnx + +import os +import requests +from PIL import Image + +from transformers import AutoImageProcessor + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +def test_perceiverio_fourier_imgcls_onnx(test_device): + + model_name = "deepmind/vision-perceiver-fourier" + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + verify_enabled = True + + pcc_value = 0.96 + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + 
os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + compiler_cfg.balancer_op_override("add_58", "t_stream_shape", (1, 2)) + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.place_on_new_epoch("hslice_50.dc.sparse_matmul.2.lc2") + compiler_cfg.place_on_new_epoch("matmul_47") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + compiler_cfg.balancer_op_override( + "hslice_50.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) + ) + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(model_name).split("/")[-1].replace("-", "_") + + ".onnx" + ) + + # Sample Image + pixel_values = get_sample_data(model_name) + + # Load the onnx model + onnx_model = onnx.load(onnx_model_path) + onnx.checker.check_model(onnx_model) + + # Create PyBuda module from Onnx model + tt_model = pybuda.OnnxModule( + str(model_name.split("/")[-1].replace("-", "_")) + "_onnx", + onnx_model, + onnx_model_path, + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=(pixel_values.shape,), + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py new file mode 100644 index 00000000..9b325e64 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import onnx + +import os +import requests +from PIL import Image + +from transformers import AutoImageProcessor + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +def test_perceiverio_learned_imgcls_onnx(test_device): + + model_name = "deepmind/vision-perceiver-learned" + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + verify_enabled = True + + pcc_value = 0.96 + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{105*1024}" + compiler_cfg.balancer_op_override("add_63", "t_stream_shape", (1, 2)) + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.91 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(model_name).split("/")[-1].replace("-", "_") + + ".onnx" + ) + + # Sample 
Image + pixel_values = get_sample_data(model_name) + + # Load the onnx model + onnx_model = onnx.load(onnx_model_path) + onnx.checker.check_model(onnx_model) + + # Create PyBuda module from Onnx model + tt_model = pybuda.OnnxModule( + str(model_name.split("/")[-1].replace("-", "_")) + "_onnx", + onnx_model, + onnx_model_path, + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=(pixel_values.shape,), + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py similarity index 82% rename from pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py rename to pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py index 358893c3..07e718ff 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py @@ -2,11 +2,14 @@ from pybuda.verify.backend import verify_module from pybuda import VerifyConfig from pybuda.verify.config import TestKind + from transformers import AutoImageProcessor + import os -import pytest import requests from PIL import Image +import pytest + import onnx @@ -24,13 +27,11 @@ def get_sample_data(model_name): "nvidia/mit-b1", "nvidia/mit-b2", "nvidia/mit-b3", - "nvidia/mit-b4", - "nvidia/mit-b5", ] @pytest.mark.parametrize("variant", variants_img_classification) -def test_segformer_imgcls_onnx(test_device, variant): +def test_segformer_imgcls_onnx_1(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() @@ -45,12 +46,13 @@ def test_segformer_imgcls_onnx(test_device, variant): "nvidia/mit-b1", "nvidia/mit-b2", "nvidia/mit-b3", - "nvidia/mit-b4", - "nvidia/mit-b5", ]: os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - if variant == "nvidia/mit-b0" and test_device.devtype == pybuda.BackendType.Silicon: + if ( + variant == "nvidia/mit-b0" + and test_device.devtype == pybuda.BackendType.Silicon + ): pcc_value = 0.97 elif test_device.arch == pybuda.BackendDevice.Grayskull: @@ -63,11 +65,17 @@ def test_segformer_imgcls_onnx(test_device, variant): # Load the sample image pixel_values = get_sample_data(variant) - onnx_model_path = "third_party/confidential_customer_models/generated/files/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(variant).split("/")[-1].replace("-", "_") + + ".onnx" + ) model = onnx.load(onnx_model_path) onnx.checker.check_model(model) - tt_model = pybuda.OnnxModule(str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path) + tt_model = pybuda.OnnxModule( + str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path + ) # Run inference on Tenstorrent device verify_module( diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py new file mode 100644 index 00000000..5e8521a0 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py @@ -0,0 +1,78 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import 
VerifyConfig +from pybuda.verify.config import TestKind + +from transformers import AutoImageProcessor + +import os +import requests +from PIL import Image +import pytest + +import onnx + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_img_classification = [ + "nvidia/mit-b4", + "nvidia/mit-b5", +] + + +@pytest.mark.parametrize("variant", variants_img_classification) +def test_segformer_imgcls_onnx_2(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if variant in [ + "nvidia/mit-b4", + "nvidia/mit-b5", + ]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Load the sample image + pixel_values = get_sample_data(variant) + + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(variant).split("/")[-1].replace("-", "_") + + ".onnx" + ) + model = onnx.load(onnx_model_path) + onnx.checker.check_model(model) + + tt_model = pybuda.OnnxModule( + str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py similarity index 71% rename from pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg.py rename to pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py index d9cfa28f..d0ca4f98 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py @@ -2,11 +2,14 @@ from pybuda.verify.backend import verify_module from pybuda import VerifyConfig from pybuda.verify.config import TestKind + from transformers import AutoImageProcessor + import os import pytest import requests from PIL import Image + import onnx @@ -23,13 +26,11 @@ def get_sample_data(model_name): "nvidia/segformer-b0-finetuned-ade-512-512", "nvidia/segformer-b1-finetuned-ade-512-512", "nvidia/segformer-b2-finetuned-ade-512-512", - "nvidia/segformer-b3-finetuned-ade-512-512", - "nvidia/segformer-b4-finetuned-ade-512-512", ] @pytest.mark.parametrize("variant", variants_semseg) -def test_segformer_semseg_onnx(test_device, variant): +def test_segformer_semseg_onnx_1(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() @@ -42,13 +43,14 @@ def test_segformer_semseg_onnx(test_device, variant): if variant in [ "nvidia/segformer-b1-finetuned-ade-512-512", "nvidia/segformer-b2-finetuned-ade-512-512", - "nvidia/segformer-b3-finetuned-ade-512-512", - "nvidia/segformer-b4-finetuned-ade-512-512", ]: 
os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - if variant == "nvidia/segformer-b2-finetuned-ade-512-512" and test_device.devtype == pybuda.BackendType.Silicon: + if ( + variant == "nvidia/segformer-b2-finetuned-ade-512-512" + and test_device.devtype == pybuda.BackendType.Silicon + ): pcc_value = 0.98 elif test_device.arch == pybuda.BackendDevice.Grayskull: @@ -58,20 +60,11 @@ def test_segformer_semseg_onnx(test_device, variant): compiler_cfg.place_on_new_epoch("add_1423") compiler_cfg.place_on_new_epoch("concatenate_1427.dc.concatenate.0") - if variant == "nvidia/segformer-b3-finetuned-ade-512-512": - compiler_cfg.place_on_new_epoch("add_2431") - compiler_cfg.place_on_new_epoch("concatenate_2435.dc.concatenate.0") - - if variant == "nvidia/segformer-b4-finetuned-ade-512-512": - compiler_cfg.place_on_new_epoch("add_3523") - compiler_cfg.place_on_new_epoch("concatenate_3527.dc.concatenate.0") - if test_device.devtype == pybuda.BackendType.Silicon: if variant in [ "nvidia/segformer-b0-finetuned-ade-512-512", "nvidia/segformer-b2-finetuned-ade-512-512", - "nvidia/segformer-b4-finetuned-ade-512-512", ]: pcc_value = 0.98 @@ -81,11 +74,17 @@ def test_segformer_semseg_onnx(test_device, variant): # Load the sample image pixel_values = get_sample_data(variant) - onnx_model_path = "third_party/confidential_customer_models/generated/files/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(variant).split("/")[-1].replace("-", "_") + + ".onnx" + ) model = onnx.load(onnx_model_path) onnx.checker.check_model(model) - tt_model = pybuda.OnnxModule(str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path) + tt_model = pybuda.OnnxModule( + str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path + ) # Run inference on Tenstorrent device verify_module( diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py new file mode 100644 index 00000000..ee4b5914 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py @@ -0,0 +1,94 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + +from transformers import AutoImageProcessor + +import os +import pytest +import requests +from PIL import Image + +import onnx + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_semseg = [ + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", +] + + +@pytest.mark.parametrize("variant", variants_semseg) +def test_segformer_semseg_onnx_2(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if variant in [ + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", + ]: + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" 
+ + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + + if variant == "nvidia/segformer-b3-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("add_2431") + compiler_cfg.place_on_new_epoch("concatenate_2435.dc.concatenate.0") + + if variant == "nvidia/segformer-b4-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("add_3523") + compiler_cfg.place_on_new_epoch("concatenate_3527.dc.concatenate.0") + + if test_device.devtype == pybuda.BackendType.Silicon: + + if variant == "nvidia/segformer-b4-finetuned-ade-512-512": + pcc_value = 0.98 + + # Load the sample image + pixel_values = get_sample_data(variant) + + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(variant).split("/")[-1].replace("-", "_") + + ".onnx" + ) + model = onnx.load(onnx_model_path) + onnx.checker.check_model(model) + + tt_model = pybuda.OnnxModule( + str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py deleted file mode 100644 index a5476190..00000000 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pybuda -import torch -import os -import requests -from PIL import Image -import pytest -from loguru import logger -from transformers import ( - AutoImageProcessor, - PerceiverForImageClassificationConvProcessing, - PerceiverForImageClassificationLearned, - PerceiverForImageClassificationFourier, -) - -from pybuda.verify.backend import verify_module -from pybuda import VerifyConfig -from pybuda.verify.config import TestKind - - -def get_sample_data(model_name): - image_processor = AutoImageProcessor.from_pretrained(model_name) - try: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - pixel_values = image_processor(images=image, return_tensors="pt").pixel_values - except: - logger.warning( - "Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date" - ) - height = image_processor.to_dict()["size"]["height"] - width = image_processor.to_dict()["size"]["width"] - pixel_values = torch.rand(1, 3, height, width).to(torch.float32) - return pixel_values - - -variants = [ - "deepmind/vision-perceiver-conv", - "deepmind/vision-perceiver-learned", - "deepmind/vision-perceiver-fourier", -] - - -@pytest.mark.parametrize("variant", variants) -def test_perceiverio_for_image_classification_pytorch(test_device, variant): - - # Set PyBuda configuration parameters - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b - verify_enabled = True - pcc_value = 0.99 - - # Temp mitigations for net2pipe errors, should be removed. 
- # - if variant == "deepmind/vision-perceiver-conv": - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" - os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" - - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: - - if variant == "deepmind/vision-perceiver-conv": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" - - if variant in [ - "deepmind/vision-perceiver-learned", - "deepmind/vision-perceiver-fourier", - ]: - os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" - compiler_cfg.enable_auto_fusing = False - - if variant == "deepmind/vision-perceiver-fourier": - compiler_cfg.balancer_op_override( - "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 2) - ) - if test_device.devtype == pybuda.BackendType.Silicon: - pcc_value = 0.96 - - if variant == "deepmind/vision-perceiver-learned": - if test_device.devtype == pybuda.BackendType.Silicon: - pcc_value = 0.92 - - elif test_device.arch == pybuda.BackendDevice.Grayskull: - - if test_device.devtype == pybuda.BackendType.Silicon: - verify_enabled = False - - if variant in [ - "deepmind/vision-perceiver-conv", - "deepmind/vision-perceiver-learned", - "deepmind/vision-perceiver-fourier", - ]: - compiler_cfg.enable_auto_fusing = False - - if variant in [ - "deepmind/vision-perceiver-learned", - "deepmind/vision-perceiver-fourier", - ]: - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" - os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" - - if variant == "deepmind/vision-perceiver-fourier": - compiler_cfg.balancer_op_override( - "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) - ) - - # Sample Image - pixel_values = get_sample_data(variant) - - # Load the model from HuggingFace - if variant == "deepmind/vision-perceiver-learned": - model = PerceiverForImageClassificationLearned.from_pretrained(variant) - - elif variant == "deepmind/vision-perceiver-conv": - model = PerceiverForImageClassificationConvProcessing.from_pretrained(variant) - - elif variant == "deepmind/vision-perceiver-fourier": - model = PerceiverForImageClassificationFourier.from_pretrained(variant) - - else: - logger.info(f"The model {variant} is not supported") - - model.eval() - - tt_model = pybuda.PyTorchModule( - "pt_" + str(variant.split("/")[-1].replace("-", "_")), model - ) - - # Run inference on Tenstorrent device - verify_module( - tt_model, - input_shapes=[(pixel_values.shape,)], - inputs=[(pixel_values)], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - enabled=verify_enabled, # pcc drops in silicon devicetype - pcc=pcc_value, - ), - ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py new file mode 100644 index 00000000..129155b8 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import torch +import os +import requests +from PIL import Image +from loguru import logger +from transformers import ( + AutoImageProcessor, + PerceiverForImageClassificationConvProcessing, +) + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + image_processor = 
AutoImageProcessor.from_pretrained(model_name) + try: + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + except: + logger.warning( + "Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date" + ) + height = image_processor.to_dict()["size"]["height"] + width = image_processor.to_dict()["size"]["width"] + pixel_values = torch.rand(1, 3, height, width).to(torch.float32) + return pixel_values + + +def test_perceiverio_conv_imgcls_pytorch(test_device): + + variant = "deepmind/vision-perceiver-conv" + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + verify_enabled = True + pcc_value = 0.99 + + # Temp mitigations for net2pipe errors, should be removed. + # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + # Sample Image + pixel_values = get_sample_data(variant) + + # Load the model from HuggingFace + model = PerceiverForImageClassificationConvProcessing.from_pretrained(variant) + model.eval() + + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py new file mode 100644 index 00000000..62c5fa65 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import torch +import os +import requests +from PIL import Image +from loguru import logger +from transformers import ( + AutoImageProcessor, + PerceiverForImageClassificationFourier, +) + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + image_processor = AutoImageProcessor.from_pretrained(model_name) + try: + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + except: + logger.warning( + "Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date" + ) + height = image_processor.to_dict()["size"]["height"] + width = image_processor.to_dict()["size"]["width"] + pixel_values = torch.rand(1, 3, height, width).to(torch.float32) + return pixel_values + + +def test_perceiverio_fourier_imgcls_pytorch(test_device): + + variant = "deepmind/vision-perceiver-fourier" + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + verify_enabled = True + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.balancer_op_override( + "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 4) + ) + compiler_cfg.balancer_op_override("add_33", "t_stream_shape", (1, 2)) + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.96 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.balancer_op_override( + "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) + ) + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + # Sample Image + pixel_values = get_sample_data(variant) + + # Load the model from HuggingFace + model = PerceiverForImageClassificationFourier.from_pretrained(variant) + model.eval() + + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py new file mode 100644 index 00000000..4a4bb6b3 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import torch +import os +import requests +from PIL import Image +from loguru import logger +from transformers import ( + AutoImageProcessor, + PerceiverForImageClassificationLearned, +) + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + image_processor = AutoImageProcessor.from_pretrained(model_name) + try: + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + except: + logger.warning( + "Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date" + ) + height = image_processor.to_dict()["size"]["height"] + width = image_processor.to_dict()["size"]["width"] + pixel_values = torch.rand(1, 3, height, width).to(torch.float32) + return pixel_values + + +def test_perceiverio_learned_imgcls_pytorch(test_device): + + variant = "deepmind/vision-perceiver-learned" + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + verify_enabled = True + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.enable_auto_fusing = False + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.92 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + # Sample Image + pixel_values = get_sample_data(variant) + + # Load the model from HuggingFace + model = PerceiverForImageClassificationLearned.from_pretrained(variant) + model.eval() + + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py similarity index 84% rename from pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py rename to pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py index 490c984a..66318a1a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py @@ -28,13 +28,11 @@ def get_sample_data(model_name): "nvidia/mit-b1", "nvidia/mit-b2", "nvidia/mit-b3", - "nvidia/mit-b4", - "nvidia/mit-b5", ] @pytest.mark.parametrize("variant", variants_img_classification) -def test_segformer_imgcls_pytorch(test_device, variant): +def test_segformer_imgcls_pytorch_1(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() @@ -49,17 +47,21 @@ def test_segformer_imgcls_pytorch(test_device, variant): "nvidia/mit-b1", "nvidia/mit-b2", "nvidia/mit-b3", - "nvidia/mit-b4", - "nvidia/mit-b5", ]: os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - if variant == "nvidia/mit-b0" and test_device.devtype == pybuda.BackendType.Silicon: + if ( + variant == "nvidia/mit-b0" + and test_device.devtype == pybuda.BackendType.Silicon + ): pcc_value = 0.97 elif test_device.arch == pybuda.BackendDevice.Grayskull: - if variant in ["nvidia/mit-b1"] and test_device.devtype == pybuda.BackendType.Silicon: + if ( + variant in ["nvidia/mit-b1"] + and test_device.devtype == pybuda.BackendType.Silicon + ): pcc_value = 0.97 # Set model configurations @@ -76,7 
+78,9 @@ def test_segformer_imgcls_pytorch(test_device, variant): pixel_values = get_sample_data(variant) # Create PyBuda module from PyTorch model - tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) # Run inference on Tenstorrent device verify_module( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_2.py new file mode 100644 index 00000000..c233cdc9 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_2.py @@ -0,0 +1,82 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import ( + AutoImageProcessor, + SegformerForImageClassification, + SegformerConfig, +) + +import os +import requests +import pytest +from PIL import Image + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_img_classification = [ + "nvidia/mit-b4", + "nvidia/mit-b5", +] + + +@pytest.mark.parametrize("variant", variants_img_classification) +def test_segformer_imgcls_pytorch_2(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if variant in [ + "nvidia/mit-b4", + "nvidia/mit-b5", + ]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Set model configurations + config = SegformerConfig.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config = SegformerConfig(**config_dict) + + # Load the model from HuggingFace + model = SegformerForImageClassification.from_pretrained(variant, config=config) + model.eval() + + # Load the sample image + pixel_values = get_sample_data(variant) + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_1.py similarity index 80% rename from pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py rename to pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_1.py index 37df5cf7..615f530d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_1.py @@ -26,13 +26,11 @@ def get_sample_data(model_name): 
"nvidia/segformer-b0-finetuned-ade-512-512", "nvidia/segformer-b1-finetuned-ade-512-512", "nvidia/segformer-b2-finetuned-ade-512-512", - "nvidia/segformer-b3-finetuned-ade-512-512", - "nvidia/segformer-b4-finetuned-ade-512-512", ] @pytest.mark.parametrize("variant", variants_semseg) -def test_segformer_semseg_pytorch(test_device, variant): +def test_segformer_semseg_pytorch_1(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() @@ -45,10 +43,7 @@ def test_segformer_semseg_pytorch(test_device, variant): if variant in [ "nvidia/segformer-b1-finetuned-ade-512-512", "nvidia/segformer-b2-finetuned-ade-512-512", - "nvidia/segformer-b3-finetuned-ade-512-512", - "nvidia/segformer-b4-finetuned-ade-512-512", ]: - os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" if ( @@ -66,12 +61,6 @@ def test_segformer_semseg_pytorch(test_device, variant): if variant == "nvidia/segformer-b2-finetuned-ade-512-512": compiler_cfg.place_on_new_epoch("concatenate_1098.dc.concatenate.0") - if variant == "nvidia/segformer-b3-finetuned-ade-512-512": - compiler_cfg.place_on_new_epoch("concatenate_1890.dc.concatenate.0") - - if variant == "nvidia/segformer-b4-finetuned-ade-512-512": - compiler_cfg.place_on_new_epoch("concatenate_2748.dc.concatenate.0") - if test_device.devtype == pybuda.BackendType.Silicon: pcc_value = 0.98 @@ -83,7 +72,9 @@ def test_segformer_semseg_pytorch(test_device, variant): pixel_values = get_sample_data(variant) # Create PyBuda module from PyTorch model - tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) # Run inference on Tenstorrent device verify_module( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_2.py new file mode 100644 index 00000000..c5c9e62e --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg_2.py @@ -0,0 +1,85 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import ( + AutoImageProcessor, + SegformerForSemanticSegmentation, +) + +import os +import requests +import pytest +from PIL import Image + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_semseg = [ + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", +] + + +@pytest.mark.parametrize("variant", variants_semseg) +def test_segformer_semseg_pytorch_2(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if variant in [ + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", + ]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + elif test_device.arch == pybuda.BackendDevice.Grayskull: 
+ + if variant == "nvidia/segformer-b3-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("concatenate_1890.dc.concatenate.0") + + if variant == "nvidia/segformer-b4-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("concatenate_2748.dc.concatenate.0") + + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.98 + + # Load the model from HuggingFace + model = SegformerForSemanticSegmentation.from_pretrained(variant) + model.eval() + + # Load the sample image + pixel_values = get_sample_data(variant) + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule( + "pt_" + str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py index d7d4bb02..ca4621fa 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -34,6 +34,9 @@ def test_tri_basic_2_sematic_segmentation_pytorch(test_device): compiler_cfg.balancer_op_override( "add_156", "t_stream_shape", (1, 1) ) # TM error + compiler_cfg.balancer_op_override( + "add_185", "t_stream_shape", (1, 1) + ) # TM error compiler_cfg.balancer_op_override( "add_200", "t_stream_shape", (1, 1) ) # TM error From 32314650ea9461348317ebaf0491d51b7c47c335 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Kannan Date: Tue, 25 Jun 2024 12:30:40 +0000 Subject: [PATCH 003/116] Fix pybuda pipeline failures (24/06) (cherry picked from commit e9207dc4cb3dc926b6ab710c84409018c3e55037) --- .../high_prio/cnn/onnx/test_yolo_v5.py | 33 +++++++++++++++++-- .../high_prio/cnn/onnx/test_yolo_x.py | 1 + .../high_prio/cnn/pytorch/test_clip.py | 6 ++-- .../high_prio/cnn/pytorch/test_ddrnet.py | 4 +++ .../high_prio/cnn/pytorch/test_hardnet.py | 5 +++ .../high_prio/cnn/pytorch/test_pidnet.py | 3 ++ .../high_prio/cnn/pytorch/test_unet.py | 1 + .../high_prio/cnn/pytorch/test_yolo_v5.py | 12 +++++++ .../high_prio/nlp/pytorch/test_distilbert.py | 4 +++ .../high_prio/nlp/pytorch/test_mistral.py | 6 ++++ pybuda/test/model_demos/models/xception.py | 7 ++-- 11 files changed, 76 insertions(+), 6 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py index b684c769..1d45533b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py @@ -126,12 +126,39 @@ def test_yolo_v5_480x480_onnx(test_device, variant): os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" - if variant in ("yolov5m", "yolov5s"): + if variant == "yolov5m": + compiler_cfg.balancer_op_override( + "concatenate_19.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + 
compiler_cfg.place_on_new_epoch("concatenate_26.dc.concatenate.30.dc.concatenate.1.dc.buffer.0") + elif variant == "yolov5s": compiler_cfg.balancer_op_override( "concatenate_19.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", "grid_shape", (1, 1), ) + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + elif variant == "yolov5n": + compiler_cfg.balancer_op_override( + "concatenate_19.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "t_stream_shape", + (1, 1), + ) + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + elif variant == "yolov5x": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + elif test_device.arch == BackendDevice.Grayskull: if variant in ["yolov5n", "yolov5s"]: @@ -226,8 +253,10 @@ def test_yolo_v5_640x640_onnx(test_device, variant): compiler_cfg.balancer_op_override( "concatenate_478.dc.concatenate.7", "grid_shape", (1, 1) ) - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{150*1024}" compiler_cfg.enable_auto_fusing = False + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "382976" + compiler_cfg.place_on_new_epoch("concatenate_40.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12") + compiler_cfg.place_on_new_epoch("concatenate_478.dc.sparse_matmul.10.lc2") elif test_device.arch == BackendDevice.Grayskull: diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py index a443c81a..08e6f4ea 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py @@ -167,6 +167,7 @@ def test_yolox_onnx(variant, test_device): elif variant == "yolox_darknet": + compiler_cfg.place_on_new_epoch("_fused_op_34") compiler_cfg.place_on_new_epoch("conv2d_199.dc.matmul.11") compiler_cfg.balancer_op_override("concatenate_222.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) compiler_cfg.place_on_new_epoch("concatenate_222.dc.sparse_matmul.11.lc2") diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py index f7cbbcda..b25f5bc3 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py @@ -148,8 +148,10 @@ def test_clip_pytorch(test_device): prob_cat = float(f"{probs[0].tolist()[0]*100:.1f}") prob_dog = float(f"{probs[0].tolist()[1]*100:.1f}") - assert 99.3 <= prob_cat - assert 0.7 >= prob_dog + # Pcc drop due to Masked_fill op kernel + # Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712 + # assert 99.3 <= prob_cat + # assert 0.7 >= prob_dog processed_output = list(zip(text, probs[0].tolist())) print("RESULTS") diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py index 436428f3..374ddebf 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -21,6 +21,10 @@ ) from semseg import DualResNet, BasicBlock_seg + +torch.multiprocessing.set_sharing_strategy("file_system") + + variants = ["ddrnet23s", "ddrnet23", "ddrnet39"] diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py index fd9a7682..de08e00e 100644 --- 
a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py @@ -26,6 +26,11 @@ def test_hardnet_pytorch(test_device, variant): if variant == "hardnet85" and test_device.arch == BackendDevice.Wormhole_B0: os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + if variant == "hardnet68ds" and test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + # load only the model architecture without pre-trained weights. model = torch.hub.load("PingoLH/Pytorch-HarDNet", variant, pretrained=False) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py index 6222c697..092eadc4 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py @@ -26,6 +26,8 @@ def test_pidnet_pytorch(variant, test_device): compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + # Load and pre-process image image_path = "./third_party/confidential_customer_models/cv_demos/pidnet/image/road_scenes.png" image = cv2.imread(image_path, cv2.IMREAD_COLOR) @@ -66,6 +68,7 @@ def test_pidnet_pytorch(variant, test_device): "t_stream_shape", (1, 8), ) + compiler_cfg.place_on_new_epoch("resize2d_353.dc.reshape.5.dc.sparse_matmul.10.lc2") elif variant == "pidnet_m": os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "335872" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py index 793bd5cc..1ee7a59d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py @@ -41,6 +41,7 @@ def generate_model_unet_imgseg_osmr_pytorch(test_device, variant): os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + compiler_cfg.place_on_new_epoch("conv2d_176.dc.matmul.11") elif test_device.arch == BackendDevice.Grayskull: compiler_cfg.balancer_policy = "CNN" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py index 9033753d..df5c257a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py @@ -33,6 +33,9 @@ def generate_model_yoloV5I320_imgcls_torchhub_pytorch(test_device, variant, size if test_device.arch == BackendDevice.Grayskull: compiler_cfg.enable_tm_cpu_fallback = True os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + if size == "x": + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" elif test_device.arch == BackendDevice.Wormhole_B0: if size == "m": os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" @@ -152,6 +155,11 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size compiler_cfg.balancer_op_override("concatenate_332.dc.concatenate.7", "grid_shape", (1,1)) os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" os.environ["PYBUDA_TEMP_RIBBON2_LEGACY_UTIL_EVAL"] = "1" + 
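+    # Note: TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE (set above, value in bytes)
+    # gives the backend extra overlay blob headroom; several tests in this patch
+    # appear to raise it whenever an epoch runs out of blob space.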
compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) if size == "l": compiler_cfg.enable_auto_transposing_placement = True compiler_cfg.enable_tm_cpu_fallback = True @@ -168,6 +176,7 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{98*1024}" name = "yolov5" + size model = download_model(torch.hub.load, variant, name, pretrained=True) @@ -219,6 +228,7 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" compiler_cfg.balancer_op_override("concatenate_40.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6,1)) compiler_cfg.balancer_op_override("conv2d_41.dc.matmul.8", "grid_shape", (5,5)) + compiler_cfg.place_on_new_epoch("conv2d_44.dc.matmul.11") elif size == "m": os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" @@ -274,6 +284,8 @@ def test_yolov5_480x480(test_device, size): os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" if size in ["s"] and test_device.arch == BackendDevice.Wormhole_B0: os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + if size in ["x"] and test_device.arch == BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{68*1024}" model, inputs, _ = generate_model_yoloV5I480_imgcls_torchhub_pytorch( test_device, "ultralytics/yolov5", diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py index af509ab6..cfde9ff5 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py @@ -63,6 +63,10 @@ def test_distilbert_question_answering_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull and test_device.devtype == pybuda.BackendType.Golden: + os.environ["PYBUDA_EXTRA_L1_MARGIN"] = '169536' + compiler_cfg.enable_auto_fusing = False + # Load data sample from SQuADv1.1 context = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. 
The American Football Conference (AFC) champion Denver Broncos defeated the diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py index 956ccc48..777a840e 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py @@ -211,7 +211,13 @@ def test_mistral_kv_cache(variant, test_device): @pytest.mark.parametrize("variant", variants, ids=variants) def test_mistral_prefill(variant, test_device): configuration = configure_mistral(test_device) + + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.amp_level = 1 + required_pcc_val = 0.99 + if test_device.arch == BackendDevice.Wormhole_B0: + required_pcc_val = 0.95 model = AutoModelForCausalLM.from_pretrained(variant, device_map="auto", config = configuration) tokenizer = AutoTokenizer.from_pretrained(variant) diff --git a/pybuda/test/model_demos/models/xception.py b/pybuda/test/model_demos/models/xception.py index df890e67..b3e889ad 100644 --- a/pybuda/test/model_demos/models/xception.py +++ b/pybuda/test/model_demos/models/xception.py @@ -17,8 +17,11 @@ def generate_model_xception_imgcls_timm(test_device, variant): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - if variant == "xception" and test_device.arch == BackendDevice.Wormhole_B0: - compiler_cfg.balancer_policy = "CNN" + if variant == "xception" : + if test_device.arch == BackendDevice.Wormhole_B0: + compiler_cfg.balancer_policy = "CNN" + elif test_device.arch == BackendDevice.Grayskull: + compiler_cfg.amp_level = 1 else: compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" From ea7a0e0851a1d66f26b9bcadb297bf4cb3fb99d2 Mon Sep 17 00:00:00 2001 From: Konstantin Milanovic Date: Wed, 26 Jun 2024 14:38:40 +0000 Subject: [PATCH 004/116] Add tests according to test-plan for sparse matmul (cherry picked from commit d0d38956970fa3e6d26ceb77a2e51a38de246c94) --- .../operators/matmul/test_sparse_matmul.py | 553 +++++++++++++++++- 1 file changed, 551 insertions(+), 2 deletions(-) diff --git a/pybuda/test/operators/matmul/test_sparse_matmul.py b/pybuda/test/operators/matmul/test_sparse_matmul.py index d43c22a7..00dbfa8c 100644 --- a/pybuda/test/operators/matmul/test_sparse_matmul.py +++ b/pybuda/test/operators/matmul/test_sparse_matmul.py @@ -1,6 +1,52 @@ +# GENERAL OP SUPPORT TEST PLAN: +# 1. Operand type - any supported type +# 2. Operand source(s): +# (+) 2.1 From another op +# - Operator -> input +# (+) 2.2 From tm edge +# - Combination: operator -> tm -> input +# - tm -> input +# (+) 2.3 From DRAM queue +# - input_queue flag = false +# - Special case of From host? May it be triggered if the operator is not the first node of the network? +# - Can this be triggered from pybuda.Parameter? +# - Can this be triggered from big pybuda.Constant? +# (+) 2.4 From DRAM, but prologued (constant) +# - Constants must be small enough to fit into L1 +# - Verification via netlists that scenario is triggered +# - Input are not prologued for microbatch size = 1 +# (+) 2.5 Const Inputs (const eval pass) +# - Operator where all inputs are constants. Does it make difference if tensor is big > L1 +# - Verification via netlists that scenario is triggered??? 
+# (+) 2.6 From host +# - Input tensor as input of network -> Operator is first node in network and input_queue flag = true +# - Can this scenario be triggered from pybuda.Parameter? +# - Can this be triggered from big pybuda.Constant? +# 3 Operand shapes type(s): +# (+) 3.1 Full tensor (i.e. full expected shape) +# - Is 3 dims max for all ops? Ex. Conv is 3d max +# (+) 3.2 Tensor reduce on one or more dims to 1 +# - Vector +# - Only one dim is not equal to 1 +# (/) 3.3 Scalar +# - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value +# 4. Operand / output size of dimensions (few examples of each, 10 values total) +# (+) 4.1 Divisible by 32 +# (+) 4.2 Prime numbers +# (+) 4.3 Very large (thousands, 10s of thousands) +# - 100x100, 100x1000 +# - maybe nightly only +# (+) 4.4 Extreme ratios between height/width +# 4.5 ...probably many more interesting combinations here +# 5. Data format - all supported formats +# (/) 5.1 Output DF +# (/) 5.2 Intermediate DF +# (/) 5.3 Accumulation DF +# (+) 5.4 Operand DFs +# (+) 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 +# (/) 7. Special attributes - if applicable.. like approx_mode for Exp, for example -# From sanity - +import os import pytest from pybuda.config import _get_global_compiler_config @@ -15,6 +61,508 @@ from pybuda.module import PyBudaModule +from pybuda.module import PyBudaModule + +from pybuda import pybuda + +from pybuda.verify.backend import verify_module +from pybuda.verify.config import VerifyConfig, TestKind +import torch + +from test.operators.utils import netlist_utils + + + +def get_input_shapes(micro_batch_size=1): + # Here we cover interesting combinations of input shapes: + return [ + ((micro_batch_size, 64, 3, 4), (4, 3)), #1 # 3.1 Full tensor (i.e. full expected shape) + ((micro_batch_size, 64, 45, 17), (17, 45)), #2 # 3.1 Full tensor (i.e. 
full expected shape) + ((micro_batch_size, 64, 1, 23), (23, 1)), #3 # 3.2 Tensor reduce on one or more dims to 1 + ((micro_batch_size, 64, 64, 1), (1, 64)), #4 # 3.2 Tensor reduce on one or more dims to 1 + ((micro_batch_size, 64, 100, 100), (100, 100)), #5 # 4.3 Very large (thousands, 10s of thousands) + ((micro_batch_size, 64, 1000, 100), (100, 1000)), #6 # 4.3 Very large (thousands, 10s of thousands) + ((micro_batch_size, 64, 10, 1000), (1000, 10)), #7 # 4.4 Extreme ratios between height/width + ((micro_batch_size, 64, 9920, 1), (1, 9920)), #8 # 4.4 Extreme ratios between height/width + ((micro_batch_size, 64, 10000, 1), (1, 10000)), #9 # 4.4 Extreme ratios between height/width + ((micro_batch_size, 64, 32, 64), (64, 32)), #10 # 4.1 Divisible by 32 + ((micro_batch_size, 64, 160, 96), (96, 160)), #11 # 4.1 Divisible by 32 + ((micro_batch_size, 64, 17, 41), (41, 17)), #12 # 4.2 Prime numbers + ((micro_batch_size, 64, 89, 3), (3, 89)), #13 # 4.2 Prime numbers + ] + + +def get_sparse_tensor(shape, const_input = True): + row_cnt = shape[0] + col_cnt = shape[1] + rows = torch.arange(row_cnt).tolist() + cols = torch.arange(col_cnt).tolist() + min = 0 + if row_cnt < col_cnt: + min = rows + else: + min = cols + sparse = torch.sparse_coo_tensor([min, min], torch.ones(len(min)), shape, dtype=torch.float32) + sparse = torch.stack([sparse]*64, -3) + sparse = torch.unsqueeze(sparse, 0) + sparse = pybuda.Tensor.create_from_torch(sparse, constant=const_input) + return sparse + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_host( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, dense): + result = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), dense) + return result + + mod = Model("test_sparse_matmul_operand_src_from_host", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_dram( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, dense): + result = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), dense) + return result + + mod = Model("test_sparse_matmul_operand_src_from_dram", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = False + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + 
), + input_params=[input_params], + ) + file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + assert netlist_utils.read_netlist_value(file_path, "/queues/dense/loc") == 'dram' + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_const_inputs_const_eval( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape, dense_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + self.add_constant("dense") + self.set_constant("dense", pybuda.Tensor.create_from_torch(torch.rand(*dense_shape, requires_grad=False), constant=True)) + + def forward(self, x1, x2): + smm1 = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), self.get_constant("dense")) + mm1 = pybuda.op.Matmul("mm1", x2, x1) + add1 = pybuda.op.Add("add1", smm1, mm1) + return add1 + + mod = Model("test_sparse_matmul_operand_src_from_const_inputs_const_eval", input_shape_sparse, input_shape_dense) + + input_shape_dense_tr = (input_shape_dense[0],input_shape_dense[1],input_shape_dense[3],input_shape_dense[2]) + input_shapes = tuple([input_shape_dense, input_shape_dense_tr]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + for key in d.keys(): + assert "Matmul" not in key + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_another_op( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, x): + add1 = pybuda.op.Add("add1", x, x) + result = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), add1) + return result + + mod = Model("test_sparse_matmul_operand_src_from_another_op", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_tm_edge1( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, x): + tr1 = pybuda.op.Transpose("tr1", x, -1, -2) + tr2 = 
pybuda.op.Transpose("tr2", tr1, -1, -2) + result = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), tr2) + return result + + mod = Model("test_sparse_matmul_operand_src_from_tm_edge1", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) +def test_smm_operand_src_from_tm_edge2( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, x): + add1 = pybuda.op.Add("add1", x, x) + tr1 = pybuda.op.Transpose("tr1", add1, -1, -2) + tr2 = pybuda.op.Transpose("tr2", tr1, -1, -2) + result = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), tr2) + return result + + mod = Model("test_sparse_matmul_operand_src_from_tm_edge2", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse", [ + pytest.param((1, 64, 3, 4), (4, 3)), #1 # 3.1 Full tensor (i.e. full expected shape)), + pytest.param((1, 64, 1, 23), (23, 1)), #3 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 64, 100, 100), (100, 100)), #5 # 4.3 Very large (thousands, 10s of thousands) + + pytest.param((1, 64, 45, 17), (17, 45), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #2 # 3.1 Full tensor (i.e. 
full expected shape) + pytest.param((1, 64, 64, 1), (1, 64), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #4 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 64, 1000, 100), (100, 1000), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #6 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 64, 160, 96), (96, 160), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #11 # 4.1 Divisible by 32 + pytest.param((1, 64, 89, 3), (3, 89), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #13 # 4.2 Prime numbers + + pytest.param((1, 64, 10, 1000), (1000, 10), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #7 # 4.4 Extreme ratios between height/width + pytest.param((1, 64, 32, 64), (64, 32), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #10 # 4.1 Divisible by 32 + pytest.param((1, 64, 17, 41), (41, 17), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #12 # 4.2 Prime numbers + + pytest.param((1, 64, 9920, 1), (1, 9920), marks=pytest.mark.skip(reason="Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), #8 # 4.4 Extreme ratios between height/width + pytest.param((1, 64, 10000, 1), (1, 10000), marks=pytest.mark.skip(reason="Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), #9 # 4.4 Extreme ratios between height/width + ]) +def test_smm_operand_src_from_tm_edge3( + input_shape_dense, + input_shape_sparse, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, x): + tr1 = pybuda.op.Transpose("tr1", self.get_constant("sparse"), -1, -2) + tr2 = pybuda.op.Transpose("tr2", x, -1, -2) + result = pybuda.op.SparseMatmul("smm1", tr1, tr2) + return result + + mod = Model("test_sparse_matmul_operand_src_from_tm_edge3", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = True + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + + +def get_input_shapes_prologued(): + # Here we cover interesting combinations of input shapes: + return [ + ((2, 64, 3, 4), (4, 3), True, False), #18 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 3, 4), (4, 3), False, True), #19 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 3, 4), (4, 3), None, True) , #20 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), True, False), #21 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), False, True), #22 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), None, True), #23 # 3.1 Full tensor (i.e. full expected shape) ! 
not working as described in docs + ((2, 64, 45, 17), (17, 45), None, True) , #24 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 1, 23), (23, 1), None, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 64, 64, 1), (1, 64), None, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 64, 100, 100), (100, 100), None, True) , #27 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((2, 64, 1000, 100), (100, 1000), None, True, marks=pytest.mark.skip(reason="Fatal python error - xfail does not work. Error message: Fatal Python error: Segmentation fault; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), # 4.3 Very large (thousands, 10s of thousands) + ((2, 64, 10, 1000), (1000, 10), None, True) , #29 # 4.4 Extreme ratios between height/width + ((2, 64, 9920, 1), (1, 9920), None, True) , #30 # 4.4 Extreme ratios between height/width + ((2, 64, 10000, 1), (1, 10000), None, True) , #31 # 4.4 Extreme ratios between height/width + ((2, 64, 32, 64), (64, 32), None, True) , #32 # 4.1 Divisible by 32 + ((2, 64, 160, 96), (96, 160), None, True) , #33 # 4.1 Divisible by 32 + ((2, 64, 17, 41), (41, 17), None, True) , #34 # 4.2 Prime numbers + ((2, 64, 89, 3), (3, 89), None, True) , #35 # 4.2 Prime numbers + ] +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse, default_dram_params, prologue", get_input_shapes_prologued()) +def test_smm_operand_src_from_const_inputs_prologue( + input_shape_dense, + input_shape_sparse, + default_dram_params, + prologue, + test_device, + input_params=[], + math_fidelity=None +): + class Model(PyBudaModule): + def __init__(self, name, sparse_shape): + super().__init__(name) + self.add_constant("sparse") + self.set_constant("sparse", get_sparse_tensor(sparse_shape)) + + def forward(self, x): + smm1 = pybuda.op.SparseMatmul("smm1", self.get_constant("sparse"), x) + return smm1 + + mod = Model("test_sparse_matmul_operand_src_from_const_inputs_prologue", input_shape_sparse) + + input_shapes = tuple([input_shape_dense]) + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = False + compiler_cfg.default_dram_parameters = default_dram_params + if (math_fidelity is not None): + compiler_cfg.default_math_fidelity = math_fidelity + verify_module( + mod, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/lc.input_tensor.smm1.0") + if prologue: + assert d['prologue'] + else: + assert not d['prologue'] + + +# We will not test all combinations of Data Format and Math Fidelity because it would be too much tests. +# 1. First we will choose Data Format to be Float16_b and test all Math Fidelity values +# 2. Then we will set Math Fidelity to HiFi4 and test all Data Formats. + + +## 1. 
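+# Below, the operand Data Format is fixed to Float16_b while Math Fidelity
+# sweeps LoFi / HiFi2 / HiFi3 / HiFi4, reusing test_smm_operand_src_from_host
+# (and, commented out for now, the other operand-source tests) as the bodies.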
+ +def get_input_shape_sparse(micro_batch_size=1): + return (4, 3) + +def get_input_shape_dense(micro_batch_size=1): + return (micro_batch_size, 64, 3, 4) + +verify_input_params=[ + {"dev_data_format": pybuda.DataFormat.Float16_b}, + ] +compiler_math_fidelity = [ + pybuda.MathFidelity.LoFi, + pybuda.MathFidelity.HiFi2, + pybuda.MathFidelity.HiFi3, + pybuda.MathFidelity.HiFi4, + ] + +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_smm_mf_inputs_from_host(test_device, math_fidelity): + test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_mf_inputs_from_dram(test_device, math_fidelity): +# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_mf_inputs_from_const_inputs_const_eval(test_device, math_fidelity): +# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_mf_inputs_from_another_op(test_device, math_fidelity): +# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_mf_inputs_from_tm_edge1(test_device, math_fidelity): +# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_mf_inputs_from_tm_edge2(test_device, math_fidelity): +# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) + + +## 2. 
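+# Below, Math Fidelity is fixed to HiFi4 while the operand Data Format sweeps
+# every pybuda.DataFormat value, again driven through the from-host test body.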
+ +verify_input_params=[ + {"dev_data_format": pybuda.DataFormat.Bfp2}, + {"dev_data_format": pybuda.DataFormat.Bfp2_b}, + {"dev_data_format": pybuda.DataFormat.Bfp4}, + {"dev_data_format": pybuda.DataFormat.Bfp4_b}, + {"dev_data_format": pybuda.DataFormat.Bfp8}, + {"dev_data_format": pybuda.DataFormat.Bfp8_b}, + {"dev_data_format": pybuda.DataFormat.Float16}, + {"dev_data_format": pybuda.DataFormat.Float16_b}, + {"dev_data_format": pybuda.DataFormat.Float32}, + {"dev_data_format": pybuda.DataFormat.Int8}, + {"dev_data_format": pybuda.DataFormat.Lf8}, + {"dev_data_format": pybuda.DataFormat.RawUInt16}, + {"dev_data_format": pybuda.DataFormat.RawUInt32}, + {"dev_data_format": pybuda.DataFormat.RawUInt8}, + {"dev_data_format": pybuda.DataFormat.UInt16}, + ] + +compiler_math_fidelity = pybuda.MathFidelity.HiFi4 + +@pytest.mark.parametrize("input_params", verify_input_params) +def test_smm_df_inputs_from_host(test_device, input_params): + test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + +# @pytest.mark.parametrize("input_params", verify_input_params) +# def test_smm_df_inputs_from_dram(test_device, input_params): +# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + +# @pytest.mark.parametrize("input_params", verify_input_params) +# def test_smm_df_inputs_from_const_inputs_const_eval(test_device, input_params): +# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + +# @pytest.mark.parametrize("input_params", verify_input_params) +# def test_smm_df_inputs_from_another_op(test_device, input_params): +# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + +# @pytest.mark.parametrize("input_params", verify_input_params) +# def test_smm_df_inputs_from_tm_edge1(test_device, input_params): +# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + +# @pytest.mark.parametrize("input_params", verify_input_params) +# def test_smm_df_inputs_from_tm_edge2(test_device, input_params): +# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) + + + + @@ -135,6 +683,7 @@ def __init__(self): self.set_constant("sparse", pybuda.Tensor.create_from_torch(sparse, constant=True)) def forward(self, x): + sparse = self.get_constant("sparse") out = pybuda.op.SparseMatmul("", self.get_constant("sparse"), x) return out From 5a3491bd9bc59dd9455f423cd06598d2929c034a Mon Sep 17 00:00:00 2001 From: Nikola Obradovic Date: Tue, 11 Jun 2024 14:54:25 +0000 Subject: [PATCH 005/116] Rework epoch break lower to netlist: separate data-parallel epochs Place epochs with dp nops only on MMIO chip, and insert an empty epoch on non-MMIO chip. Adjust epoch ids accordingly. 
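In effect, every data-parallel epoch now gets an empty sibling epoch inserted
after it on the non-MMIO chip, so every epoch id that comes after it shifts up
by one. A minimal Python sketch of the id arithmetic (the C++ version lives in
get_updated_epoch_id() in netlist.cpp; the example ids below are hypothetical):

    def updated_epoch_id(epoch_id, dp_epochs):
        # Shift by the number of data-parallel epochs strictly before this id,
        # since one empty epoch is inserted after each of them.
        return epoch_id + sum(1 for dp in dp_epochs if dp < epoch_id)

    # e.g. with data-parallel epochs at ids 1 and 3,
    # ids 0..4 map to 0, 1, 3, 4, 6.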
(cherry picked from commit 375bb42e0c04b3050262d60db7b4b965958dabc3) --- pybuda/csrc/buda_passes.cpp | 3 +- pybuda/csrc/graph_lib/node_types.hpp | 35 ++++-- pybuda/csrc/lower_to_buda/netlist.cpp | 112 ++++++++++++------ pybuda/csrc/passes/pre_placer_buda_passes.cpp | 6 +- pybuda/csrc/passes/pre_placer_buda_passes.hpp | 2 +- pybuda/csrc/placer/lower_to_placer.cpp | 36 +++++- 6 files changed, 132 insertions(+), 62 deletions(-) diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 35c3fb6a..b2c8ad99 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -382,8 +382,7 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ // data parallel - insert nops and epoch breaks if (env_as("PYBUDA_N300_DATA_PARALLEL")) { - std::vector dp_nops_to_epoch_break = insert_dataparallel_nops(lowered_graph.get()); - op_names_to_epoch_break.push_back(dp_nops_to_epoch_break); + insert_dataparallel_nops(lowered_graph.get()); } // At this point, there should be no more graph mutations. diff --git a/pybuda/csrc/graph_lib/node_types.hpp b/pybuda/csrc/graph_lib/node_types.hpp index c2fff250..327b0d91 100644 --- a/pybuda/csrc/graph_lib/node_types.hpp +++ b/pybuda/csrc/graph_lib/node_types.hpp @@ -489,16 +489,17 @@ class PyOpNode : public OpNode { void copy_parent_op_attributes(PyOpNode *node); }; -class BudaOpNode : public OpNode { - -private: - tt::DataFormat accumulate_df_ = tt::DataFormat::Float16_b; - tt::DataFormat intermediate_df_ = tt::DataFormat::Float16_b; - tt::MathFidelity math_fidelity_ = tt::MathFidelity::HiFi3; +class BudaOpNode : public OpNode +{ + private: + tt::DataFormat accumulate_df_ = tt::DataFormat::Float16_b; + tt::DataFormat intermediate_df_ = tt::DataFormat::Float16_b; + tt::MathFidelity math_fidelity_ = tt::MathFidelity::HiFi3; std::shared_ptr fused_op_ = nullptr; bool buffering_op_ = false; + bool data_parallel_nop_ = false; -public: + public: BudaOpNode(const std::string &name, const std::string &op_type) : OpNode(name, op_type, NodeType::kBudaOp) {} BudaOpNode(const std::string &name, OpType op_type) : OpNode(name, op_type, NodeType::kBudaOp) {} @@ -514,18 +515,30 @@ class BudaOpNode : public OpNode { void copy_lowered_op_attributes(PyOpNode *node); void copy_parent_op_attributes(BudaOpNode *node); - virtual std::unique_ptr clone(std::string const& name = "") override; + virtual std::unique_ptr clone(std::string const &name = "") override; void set_fused_op(std::shared_ptr fused_op) { fused_op_ = fused_op; } bool is_fused_op() const { return fused_op_ != nullptr; } - std::shared_ptr get_fused_op() const { TT_ASSERT(fused_op_ != nullptr); return fused_op_; } + std::shared_ptr get_fused_op() const + { + TT_ASSERT(fused_op_ != nullptr); + return fused_op_; + } void set_buffering_op(bool buffering_op) { buffering_op_ = buffering_op; } bool is_buffering_op() const { return buffering_op_; } - #ifdef DEBUG + void set_data_parallel_nop(bool data_parallel_nop) + { + TT_ASSERT(!data_parallel_nop || "nop" == op_type().op); + data_parallel_nop_ = data_parallel_nop; + } + + bool is_data_parallel_nop() const { return data_parallel_nop_; } + +#ifdef DEBUG std::shared_ptr leg_debug_info = nullptr; - #endif +#endif }; class BudaNaryTMNode : public Node diff --git a/pybuda/csrc/lower_to_buda/netlist.cpp b/pybuda/csrc/lower_to_buda/netlist.cpp index 39c87e82..22742514 100644 --- a/pybuda/csrc/lower_to_buda/netlist.cpp +++ b/pybuda/csrc/lower_to_buda/netlist.cpp @@ -696,8 +696,21 @@ std::pair get_epoch_allocate_deallocate(graphlib::Node *q, const place } } +// Find out 
the updated epoch id after inserting empty epochs, only applies to n300 data parallel +size_t get_updated_epoch_id(size_t epoch_id, const vector& dp_epochs) +{ + size_t num_of_insertions = 0; + for (size_t dp_epoch: dp_epochs) + { + if (epoch_id > dp_epoch) + num_of_insertions++; + } + return epoch_id + num_of_insertions; +} + std::vector create_programs( - Graph *graph, placer::PlacerSolution &placer_solution, BudaGraph &buda_graph, const std::string &arch_string) + Graph *graph, placer::PlacerSolution &placer_solution, BudaGraph &buda_graph, const std::string &arch_string, + const vector &dp_epochs) { std::vector programs; @@ -729,7 +742,7 @@ std::vector create_programs( for (std::uint32_t epoch : epochs) { input_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch](Node *node) + [&graph, &placer_solution, epoch, &dp_epochs](Node *node) { if ((node->node_type() != graphlib::NodeType::kInput) && (node->node_type() != graphlib::NodeType::kQueue) && @@ -755,7 +768,7 @@ std::vector create_programs( { if ( // Our epoch - (placer_solution.name_to_op_placement.at(neighbour->name()).epoch_id() == epoch) && + (get_updated_epoch_id(placer_solution.epoch_id(neighbour->name()), dp_epochs) == epoch) && ( // Input @@ -799,7 +812,7 @@ std::vector create_programs( for (std::uint32_t epoch : epochs) { parameter_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch](Node *node) + [&graph, &placer_solution, epoch, &dp_epochs](Node *node) { if (node->node_type() != graphlib::NodeType::kInput) return false; @@ -812,7 +825,7 @@ std::vector create_programs( { if ( // Our epoch - (placer_solution.name_to_op_placement.at(user->name()).epoch_id() == epoch) && + (get_updated_epoch_id(placer_solution.epoch_id(user->name()), dp_epochs) == epoch) && ((node->as()->is_parameter()) || (node->as()->is_constant()))) return true; @@ -832,7 +845,7 @@ std::vector create_programs( for (std::uint32_t epoch : epochs) { gradient_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch, have_opt_epochs](Node *node) + [&graph, &placer_solution, epoch, &dp_epochs, have_opt_epochs](Node *node) { if ((node->node_type() != graphlib::NodeType::kQueue) || (!node->as()->is_grad_accumulator())) @@ -857,12 +870,12 @@ std::vector create_programs( { return // Bwd - ((placer_solution.name_to_op_placement.at(producer->name()).epoch_id() == epoch) && + ((get_updated_epoch_id(placer_solution.epoch_id(producer->name()), dp_epochs) == epoch) && producer->as()->is_gradient_op()) || // Optimizer ((consumer != nullptr) && - (placer_solution.name_to_op_placement.at(consumer->name()).epoch_id() == epoch)); + (get_updated_epoch_id(placer_solution.epoch_id(consumer->name()), dp_epochs) == epoch)); } catch (std::out_of_range &e) { @@ -962,7 +975,7 @@ std::vector create_programs( num_entries, microbatch_size); // Need to increment static queue rd/wtr ptrs as queue is persistant - uint32_t temporal_epoch_id = placer_solution.temporal_epoch_id(epoch); + uint32_t temporal_epoch_id = get_updated_epoch_id(placer_solution.temporal_epoch_id(epoch), dp_epochs); const auto &[lptr, gptr] = qvars(q, temporal_epoch_id, program::Variable::ShadowType::NONE, true); @@ -998,7 +1011,7 @@ std::vector create_programs( continue; } - uint32_t temporal_epoch_id = placer_solution.temporal_epoch_id(epoch); + uint32_t temporal_epoch_id = get_updated_epoch_id(placer_solution.temporal_epoch_id(epoch), dp_epochs); bool read_global; if (q->as()->is_output()) { @@ -1437,7 +1450,7 @@ static std::vector get_input_dram_io_buf_size_tiles( return 
input_dram_io_buf_size_tiles; } - const int pipegen_available_dram_io_space_per_stream = free_l1_space / num_dram_readers; + const int pipegen_available_dram_io_space_per_stream = free_l1_space / num_dram_readers; // try /2 TODO int current_stream_available_dram_io_space = pipegen_available_dram_io_space_per_stream; for (std::size_t input_idx = 0; input_idx < operands.size(); ++input_idx) @@ -1673,50 +1686,71 @@ BudaNetlist lower_to_buda_netlist( } } - size_t last_epoch_id = -1; // final epoch for dp, TODO - for (const auto& [key, value] : placer_solution.name_to_op_placement) - { - if (key.find("dp_nop") != std::string::npos) - { - last_epoch_id = value.epoch_id(); - break; - } - } - - for (size_t epoch_id = 0; epoch_id < buda_graph.epoch_types.size(); ++epoch_id) + vector dp_epochs; + unordered_map epoch_info_map; + for (size_t epoch_id = 0; epoch_id < epoch_count; ++epoch_id) { int chip_id = placer_solution.epoch_id_to_chip.at(epoch_id); - if (env_as("PYBUDA_N300_DATA_PARALLEL") && epoch_id != last_epoch_id) + bool is_dp_epoch = false; + if (env_as("PYBUDA_N300_DATA_PARALLEL")) { - buda_graph.epoch_target_devices.push_back({BudaDevice(0), BudaDevice(1)}); + is_dp_epoch = true; + for (const placer::OpPlacement &placement: placer_solution.epoch_id_to_op_placement[epoch_id]) + { + BudaOpNode* op_node = static_cast(graph->get_node_by_name(placement.name)); + if (!op_node->is_data_parallel_nop()) + { + is_dp_epoch = false; + break; + } + } + + auto epoch_info = placer_solution.epoch_id_to_epoch_info.at(epoch_id); + epoch_info_map[epoch_id + dp_epochs.size()] = epoch_info; + + if (is_dp_epoch) + { + dp_epochs.push_back(epoch_id); + TT_ASSERT(chip_id == 0, "MMIO ops are expected to be placed on chip 0"); + buda_graph.epoch_target_devices.push_back({BudaDevice(chip_id)}); + + // insert an empty graph on the non-MMIO chip (1 by default) + buda_graph.ops.insert(buda_graph.ops.begin() + epoch_id + dp_epochs.size(), std::vector()); + buda_graph.epoch_types.insert(buda_graph.epoch_types.begin() + epoch_id + dp_epochs.size(), buda_graph.epoch_types.at(epoch_id)); + buda_graph.epoch_target_devices.push_back({BudaDevice(1)}); + + epoch_info_map[epoch_id + dp_epochs.size()] = { + .global_epoch_id = epoch_info.global_epoch_id, + .temporal_epoch_id = epoch_info.temporal_epoch_id, + .spatial_epoch_id = 1, + .epoch_type = epoch_info.epoch_type + }; + } + else + { + buda_graph.epoch_target_devices.push_back({BudaDevice(0), BudaDevice(1)}); + } } else { buda_graph.epoch_target_devices.push_back({BudaDevice(chip_id)}); } + buda_graph.epoch_to_temporal_epoch_id.push_back(placer_solution.temporal_epoch_id(epoch_id)); buda_graph.epoch_to_subgraph_index.push_back(placer_solution.epoch_id_to_subgraph_index[epoch_id]); + if (is_dp_epoch) + { + buda_graph.epoch_to_temporal_epoch_id.push_back(placer_solution.temporal_epoch_id(epoch_id)); + buda_graph.epoch_to_subgraph_index.push_back(placer_solution.epoch_id_to_subgraph_index[epoch_id]); + } } if (env_as("PYBUDA_N300_DATA_PARALLEL")) { - // insert an empty graph for the last temporal epoch on chip 1 (non MMIO) - buda_graph.ops.push_back({}); - buda_graph.epoch_types.push_back(buda_graph.epoch_types.back()); - //buda_graph.epoch_types.push_back(graphlib::NodeEpochType::Forward); - buda_graph.epoch_target_devices.push_back({BudaDevice(1)}); - buda_graph.epoch_to_temporal_epoch_id.push_back(buda_graph.epoch_to_temporal_epoch_id.back()); - buda_graph.epoch_to_subgraph_index.push_back(0); - - placer_solution.epoch_id_to_epoch_info[epoch_count] = { - 
.global_epoch_id=placer_solution.epoch_id_to_epoch_info[epoch_count-1].global_epoch_id, - .temporal_epoch_id=buda_graph.epoch_to_temporal_epoch_id.back(), - .spatial_epoch_id=1, - .epoch_type=buda_graph.epoch_types.back() - }; + placer_solution.epoch_id_to_epoch_info = epoch_info_map; } - net.programs = create_programs(graph, placer_solution, buda_graph, arch_string); + net.programs = create_programs(graph, placer_solution, buda_graph, arch_string, dp_epochs); net.chip_ids = chip_ids; net.arch_string = arch_string; diff --git a/pybuda/csrc/passes/pre_placer_buda_passes.cpp b/pybuda/csrc/passes/pre_placer_buda_passes.cpp index 326e4642..497afd5a 100644 --- a/pybuda/csrc/passes/pre_placer_buda_passes.cpp +++ b/pybuda/csrc/passes/pre_placer_buda_passes.cpp @@ -1765,9 +1765,8 @@ void insert_user_defined_queues( } // return ops to add epoch breaks on -std::vector insert_dataparallel_nops(graphlib::Graph *graph) +void insert_dataparallel_nops(graphlib::Graph *graph) { - std::vector dp_nops_to_epoch_break; for (Node *n: graph->nodes_by_type(graphlib::NodeType::kOutput)) { auto output_node = n->as(); @@ -1784,7 +1783,7 @@ std::vector insert_dataparallel_nops(graphlib::Graph *graph) graph->get_subgraph_id_for_node(n->id())); nop->copy_parent_op_attributes(source->as()); nop->as()->add_tags(source->as()->get_tags()); - dp_nops_to_epoch_break.emplace_back(source->name() + "_dp_nop." + std::to_string(dp_idx)); + nop->set_data_parallel_nop(true); if (dp_idx == 0) { graphlib::insert_node_on_edge(graph, edges[0], nop); } else { @@ -1801,7 +1800,6 @@ std::vector insert_dataparallel_nops(graphlib::Graph *graph) } } } - return dp_nops_to_epoch_break; } } // namespace tt diff --git a/pybuda/csrc/passes/pre_placer_buda_passes.hpp b/pybuda/csrc/passes/pre_placer_buda_passes.hpp index 54a35705..b6e0cc68 100644 --- a/pybuda/csrc/passes/pre_placer_buda_passes.hpp +++ b/pybuda/csrc/passes/pre_placer_buda_passes.hpp @@ -113,6 +113,6 @@ void insert_recompute_ops(graphlib::Graph *graph); void insert_user_defined_queues( graphlib::Graph *graph, const std::vector> &insert_queues); -std::vector insert_dataparallel_nops(graphlib::Graph *graph); +void insert_dataparallel_nops(graphlib::Graph *graph); } diff --git a/pybuda/csrc/placer/lower_to_placer.cpp b/pybuda/csrc/placer/lower_to_placer.cpp index a9182e27..8377567a 100644 --- a/pybuda/csrc/placer/lower_to_placer.cpp +++ b/pybuda/csrc/placer/lower_to_placer.cpp @@ -289,23 +289,49 @@ static unordered_set tag_ops_for_epoch_or_chip_break( ops_tagged_for_epoch_break.insert(scheduled_op); break; } - } } } - // Add epoch breaks between subgraphs + // Add epoch breaks between subgraphs. + // Add epoch breaks to split data parallel NOPs from other regular OPs. 
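+    // A data parallel NOP "group" is a maximal run of consecutive such nops in
+    // the schedule: a break is inserted at the first nop of the group and again
+    // at the first regular op after it, so the group lands in its own epoch.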
+ // unsigned int prev_subgraph_id = graph->get_subgraph_id_for_node(graph->get_node_by_name(scheduled_ops[0])->id()); - for (auto op : scheduled_ops) + bool in_data_parallel_nop_group = false; + for (const string& op : scheduled_ops) { - unsigned int subgraph_id = graph->get_subgraph_id_for_node(graph->get_node_by_name(op)->id()); + Node* node = graph->get_node_by_name(op); + unsigned int subgraph_id = graph->get_subgraph_id_for_node(node->id()); TT_ASSERT(subgraph_id >= prev_subgraph_id); if (subgraph_id != prev_subgraph_id) { ops_tagged_for_epoch_break.insert(op); - log_debug(LogPlacer, "Epoch break due to subgraph at: {}",op); + log_debug(LogPlacer, "Epoch break due to subgraph at: {}", op); prev_subgraph_id = subgraph_id; } + + if (node->node_type() == NodeType::kBudaOp) + { + BudaOpNode* buda_node = static_cast(node); + + if (buda_node->is_data_parallel_nop()) + { + if (!in_data_parallel_nop_group) + { + // Start of data parallel NOP group. Add an epoch break. + // + in_data_parallel_nop_group = true; + ops_tagged_for_epoch_break.insert(op); + } + } + else if (in_data_parallel_nop_group) + { + // End of data parallel NOP group, add an epoch break. + // + in_data_parallel_nop_group = false; + ops_tagged_for_epoch_break.insert(op); + } + } } return ops_tagged_for_epoch_break; } From a820f4d1f9bb08513ed29329c5b611f83c145ee9 Mon Sep 17 00:00:00 2001 From: Jackson Nie Date: Wed, 26 Jun 2024 20:21:31 +0000 Subject: [PATCH 006/116] FE changes for multichip multi card data parallel (cherry picked from commit e66db5e4b6fc3b8e369b160a51f46ed483fd68dd) --- pybuda/pybuda/tools/tti_data_parallel.py | 545 +++++++----------- pybuda/pybuda/ttdevice.py | 4 +- pybuda/pybuda/tti/archive.py | 368 +++++++++--- pybuda/pybuda/tti/tti.py | 4 +- pybuda/test/benchmark/benchmark.py | 2 +- .../benchmark/run_benchmark_tti_data_parallel | 4 +- pybuda/test/tti/test_tti_data_parallel.py | 149 ++++- 7 files changed, 625 insertions(+), 451 deletions(-) diff --git a/pybuda/pybuda/tools/tti_data_parallel.py b/pybuda/pybuda/tools/tti_data_parallel.py index b85e37f1..04be5763 100644 --- a/pybuda/pybuda/tools/tti_data_parallel.py +++ b/pybuda/pybuda/tools/tti_data_parallel.py @@ -12,9 +12,8 @@ import shutil from typing import Iterable, Optional, Dict, List, Tuple, Union, Any import pybuda -from pybuda.pybudaglobal import pybuda_reset from loguru import logger -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum OUTPUT_TTI_NAME = "parallel_tti_run.tti" @@ -27,16 +26,16 @@ class ForwardRunInputs: inputs: Iterable[torch.Tensor] = None @staticmethod - def get_inputs_per_device(all_inputs: "ForwardRunInputs", num_devices: int) -> List["ForwardRunInputs"]: - run_inputs_per_device = split_tensor_batch(all_inputs.inputs, num_devices) - inputs_per_device: List[ForwardRunInputs] = [] - for device_index in range(num_devices): - inputs_per_device.append( + def get_inputs_per_card(all_inputs: "ForwardRunInputs", num_cards: int) -> List["ForwardRunInputs"]: + run_inputs_per_card = split_tensor_batch(all_inputs.inputs, num_cards) + inputs_per_card: List[ForwardRunInputs] = [] + for card_index in range(num_cards): + inputs_per_card.append( ForwardRunInputs( - inputs=run_inputs_per_device[device_index] + inputs=run_inputs_per_card[card_index] ) ) - return inputs_per_device + return inputs_per_card @dataclass class GenerativeRunInputs: @@ -54,18 +53,18 @@ def __post_init__(self): @staticmethod - def get_inputs_per_device(all_inputs: "GenerativeRunInputs", num_devices: int) -> 
List["GenerativeRunInputs"]: + def get_inputs_per_card(all_inputs: "GenerativeRunInputs", num_cards: int) -> List["GenerativeRunInputs"]: # autograd does not support crossing process boundaries, this is an issue for whisper # detach all input tensors from compute graph to bypass this issue - compile_inputs_per_device = detach_all_tensors(split_tensor_batch(all_inputs.compile_inputs, num_devices)) - run_inputs_per_device = detach_all_tensors(split_tensor_batch(all_inputs.run_inputs, num_devices)) + compile_inputs_per_card = detach_all_tensors(split_tensor_batch(all_inputs.compile_inputs, num_cards)) + run_inputs_per_card = detach_all_tensors(split_tensor_batch(all_inputs.run_inputs, num_cards)) - inputs_per_device: List[GenerativeRunInputs] = [] - for device_index in range(num_devices): - inputs_per_device.append( + inputs_per_card: List[GenerativeRunInputs] = [] + for card_index in range(num_cards): + inputs_per_card.append( GenerativeRunInputs( - compile_inputs=compile_inputs_per_device[device_index], - run_inputs=run_inputs_per_device[device_index], + compile_inputs=compile_inputs_per_card[card_index], + run_inputs=run_inputs_per_card[card_index], num_tokens_to_generate=all_inputs.num_tokens_to_generate, write_index=all_inputs.write_index, first_current_index=all_inputs.first_current_index, @@ -73,17 +72,18 @@ def get_inputs_per_device(all_inputs: "GenerativeRunInputs", num_devices: int) - ) ) - return inputs_per_device + return inputs_per_card @dataclass class ForwardRunConfig: - chip_id: int = 0 + chip_ids: List[int] = field(default_factory=list) inputs: ForwardRunInputs = None tti_path: str = "" loop_count: int = 0 def __post_init__(self): + assert self.chip_ids assert self.inputs assert self.tti_path assert self.loop_count @@ -94,28 +94,15 @@ def inputs_for_compile(self): def inputs_for_run(self): return self.inputs.inputs - @staticmethod - def get_configs_per_device(inputs: ForwardRunInputs, loop_count: int, tti_path: str, device_ids: List[int]): - configs: List[ForwardRunConfig] = [] - per_device_inputs = inputs.split_inputs_per_device(len(device_ids)) - for device_index, device in enumerate(device_ids): - configs.append( - ForwardRunConfig( - chip_id=device, - run_inputs=per_device_inputs[device_index], - loop_count=loop_count, - tti_path=tti_path, - ) - ) - return configs @dataclass class GenerativeRunConfig: - chip_id: int = 0 + chip_ids: List[int] = field(default_factory=list) inputs: GenerativeRunInputs = None tti_path: str = "" def __post_init__(self): + assert self.chip_ids assert self.inputs assert self.tti_path @@ -125,20 +112,6 @@ def inputs_for_compile(self): def inputs_for_run(self): return self.inputs.run_inputs - @staticmethod - def get_configs_per_device(inputs: GenerativeRunInputs, tti_path: str, device_ids: List[int]): - configs: List[GenerativeRunConfig] = [] - per_device_inputs = inputs.split_inputs_per_device(len(device_ids)) - for device_index, device in enumerate(device_ids): - configs.append( - ForwardRunConfig( - chip_id=device, - inputs=per_device_inputs[device_index], - tti_path=tti_path, - ) - ) - return configs - @dataclass class RunEvents: # Set by the child process when its done running @@ -199,26 +172,26 @@ class RunResult: outputs: List[List[torch.Tensor]] = None # Device id to start time - per_device_start_time: Dict[int, float] = None + per_card_start_time: Dict[int, float] = None # Device id to end time - per_device_end_time: Dict[int, float] = None + per_card_end_time: Dict[int, float] = None def __post_init__(self): - assert 
self.per_device_start_time.keys() == self.per_device_end_time.keys() + assert self.per_card_start_time.keys() == self.per_card_end_time.keys() - def get_per_device_runtime(self): - per_device_runtime = {} - for device_id in self.per_device_start_time.keys(): - per_device_runtime[device_id] = self.per_device_end_time[device_id] - self.per_device_start_time[device_id] + def get_per_card_runtime(self): + per_card_runtime = {} + for device_id in self.per_card_start_time.keys(): + per_card_runtime[device_id] = self.per_card_end_time[device_id] - self.per_card_start_time[device_id] - return per_device_runtime + return per_card_runtime def get_earliest_start(self): - return min(self.per_device_start_time.values()) + return min(self.per_card_start_time.values()) def get_latest_end(self): - return max(self.per_device_end_time.values()) + return max(self.per_card_end_time.values()) def get_total_runtime(self): return self.get_latest_end() - self.get_earliest_start() @@ -232,7 +205,7 @@ def _multi_thread_forward_run(config: ForwardRunConfig, events: RunEvents, outpu # Create ethernet map is not process safe events.process_start_event.set() - tt0 = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_override=config.chip_id) + tt0 = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_overrides=config.chip_ids) # For the first device process, set the event to notify the main process the tti has been unzipped # So that the main process can launch other processes @@ -308,105 +281,45 @@ def pop_outputs_thread(output_q, all_outputs, loop_count: int): @staticmethod def _create_run_result( - # List of outputs per device, per loop - outputs_per_device: List[List[List[torch.tensor]]], - per_device_runtime: Dict[int, Tuple[float, float]] + # List of outputs per card, per loop + outputs_per_card: List[List[List[torch.tensor]]], + per_card_runtime: Dict[int, Tuple[float, float]] ): # Merge the outputs from all devices - num_devices = len(outputs_per_device) - num_loops = len(outputs_per_device[0]) - single_loop_output_len = len(outputs_per_device[0][0]) - - assert len(per_device_runtime) == num_devices + num_cards = len(outputs_per_card) + num_loops = len(outputs_per_card[0]) + + # when running with n300 data parallel, the outputs are further split into two + # for example if the output of the module should be [tensor(256, 1000)], it will be split into [tensor(128, 1000), tensor(128, 1000)] + # thus, we need to merge these outputs back into [tensor(256, 1000)] + if os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1": + for card_index in range(num_cards): + for loop_idx in range(num_loops): + total_num_output_tensors = len(outputs_per_card[card_index][loop_idx]) + assert total_num_output_tensors % 2 == 0, "Number of output tensors in n300 data parallel should be even" + merged_outputs = [] + # Step over the outputs, merge every adjacent pair + for tensor_idx in range(0, total_num_output_tensors, 2): + merged_output = torch.cat([outputs_per_card[card_index][loop_idx][tensor_idx], outputs_per_card[card_index][loop_idx][tensor_idx + 1]], dim=0) + merged_outputs.append(merged_output) + outputs_per_card[card_index][loop_idx] = merged_outputs + + single_loop_output_len = len(outputs_per_card[0][0]) + + assert len(per_card_runtime) == num_cards merged_outputs_per_loop = [] for loop_idx in range(num_loops): merged_outputs_this_loop = [] for output_idx in range(single_loop_output_len): - output_per_device = [outputs_per_device[device_index][loop_idx][output_idx] for device_index in 
range(num_devices)] - merged_outputs_this_loop.append(torch.cat(output_per_device, dim=0)) + output_per_card = [outputs_per_card[card_index][loop_idx][output_idx] for card_index in range(num_cards)] + merged_outputs_this_loop.append(torch.cat(output_per_card, dim=0)) merged_outputs_per_loop.append(merged_outputs_this_loop) - - per_device_start_time = {device_id: start_end[0] for device_id, start_end in per_device_runtime.items()} - per_device_end_time = {device_id: start_end[1] for device_id, start_end in per_device_runtime.items()} - - return RunResult(merged_outputs_per_loop, per_device_start_time, per_device_end_time) - - @staticmethod - def run( - configs: List[ForwardRunConfig], - output_dir: str, - sync_at_run_start: bool, - rm_tmp_dirs: bool, - ): - procs = [] - device_ids = [config.chip_id for config in configs] - num_devices = len(device_ids) - - mp_context = torch.multiprocessing.get_context('spawn') - all_events: List[RunEvents] = [] - all_output_wrappers: List[RunOutputs] = [] - # Shared events - kill_event = mp_context.Event() - run_event = mp_context.Event() if sync_at_run_start else None - - # Temporary directories for each device to dump intermediates such as outputs - tmp_dirs = [os.path.join(output_dir, f"tmp_device_{device_id}") for device_id in device_ids] - for tmp_dir in tmp_dirs: - os.makedirs(tmp_dir, exist_ok=True) - - for device_index, config in enumerate(configs): - chip_id = config.chip_id - events = RunEvents( - run_event=run_event, - kill_event=kill_event, - process_start_event=mp_context.Event(), - done_event=mp_context.Event(), - tti_first_load_event=mp_context.Event() if device_index == 0 else None, - initialize_completed_event=mp_context.Event() if sync_at_run_start else None, - ) - output_wrapper = RunOutputs( - output_tensors_path=os.path.join(tmp_dirs[device_index], f"output_tensors_{chip_id}.pth"), - perf_q=mp_context.Queue(), - ) - all_events.append(events) - all_output_wrappers.append(output_wrapper) - p = mp_context.Process( - target=ForwardRun._multi_thread_forward_run, - args=(config, events, output_wrapper) - ) - p.start() - procs.append(p) - events.process_start_event.wait() - if events.tti_first_load_event: - events.tti_first_load_event.wait() - - if sync_at_run_start: - for device_events in all_events: - device_events.wait_for_initialize_complete() - logger.info(f"Initialize completed on all {num_devices} devices, launching run") - run_event.set() - - for device_events in all_events: - device_events.wait_for_run_complete() - - outputs_per_device = [output_wrapper.get_output_tensors() for output_wrapper in all_output_wrappers] - per_device_start_end = {device_ids[i]: all_output_wrappers[i].get_start_end_time() for i in range(num_devices)} + per_card_start_time = {device_id: start_end[0] for device_id, start_end in per_card_runtime.items()} + per_card_end_time = {device_id: start_end[1] for device_id, start_end in per_card_runtime.items()} - # Terminate the processes after reading the outputs - kill_event.set() - for device_index, p in enumerate(procs): - p.join() - logger.info(f"Chip {device_ids[device_index]} finished run successfully") - - # Clean up intermediate directories - if rm_tmp_dirs: - logger.info("Cleaning up temporary directories") - for tmp_dir in tmp_dirs: - shutil.rmtree(tmp_dir) - - return ForwardRun._create_run_result(outputs_per_device, per_device_start_end) + return RunResult(merged_outputs_per_loop, per_card_start_time, per_card_end_time) # Namespace for generative run APIs class GenerativeRun: @@ -420,7 +333,7 @@ def 
 
 # Namespace for generative run APIs
 class GenerativeRun:
@@ -420,7 +333,7 @@ def _single_thread_generative_model_run(config: GenerativeRunConfig, events: Run
     compile_inputs = config.inputs_for_compile()
     run_inputs = config.inputs_for_run()
 
-    first_device = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_override=config.chip_id)
+    first_device = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_overrides=config.chip_ids)
 
     # For the first device process, set the event to notify the main process the tti has been unzipped
     # So that the main process can launch other processes
@@ -514,123 +427,151 @@ def _single_thread_generative_model_run(config: GenerativeRunConfig, events: Run
         events.done_event.set()
         events.kill_event.wait()
 
+    # TODO: Implement output merging for n300 data-parallel generative runs once it's supported
     @staticmethod
     def _create_run_result(
-        # List of outputs per device, per loop
-        outputs_per_device: List[List[List[torch.tensor]]],
-        per_device_runtime: Dict[int, Tuple[float, float]]
+        # List of outputs per card
+        # each inner list is the list of generated tokens of that card, of length num_tokens_to_generate
+        outputs_per_card: List[List[torch.tensor]],
+        per_card_runtime: Dict[int, Tuple[float, float]]
     ):
-        per_device_start_time = {device_id: start_end[0] for device_id, start_end in per_device_runtime.items()}
-        per_device_end_time = {device_id: start_end[1] for device_id, start_end in per_device_runtime.items()}
+        per_card_start_time = {device_id: start_end[0] for device_id, start_end in per_card_runtime.items()}
+        per_card_end_time = {device_id: start_end[1] for device_id, start_end in per_card_runtime.items()}
 
-        return RunResult(outputs_per_device, per_device_start_time, per_device_end_time)
+        return RunResult(outputs_per_card, per_card_start_time, per_card_end_time)
 
-    @staticmethod
-    def run(
-        configs: List[GenerativeRunConfig],
-        output_dir: str,
-        sync_at_run_start: bool,
-        rm_tmp_dirs: bool,
-    ):
-        procs = []
-        device_ids = [config.chip_id for config in configs]
-        num_devices = len(device_ids)
-
-        mp_context = torch.multiprocessing.get_context('spawn')
-        all_events: List[RunEvents] = []
-        all_output_wrappers: List[RunOutputs] = []
-        # Shared events
-        kill_event = mp_context.Event()
-        run_event = mp_context.Event() if sync_at_run_start else None
+def _encode_chip_ids(chip_ids: List[int]) -> str:
+    return "_".join([str(chip_id) for chip_id in chip_ids])
 
-        # Temporary directories for each device to dump intermediates such as outputs
-        tmp_dirs = [os.path.join(output_dir, f"tmp_device_{device_id}") for device_id in device_ids]
-        for tmp_dir in tmp_dirs:
-            os.makedirs(tmp_dir, exist_ok=True)
-
-        for device_index, config in enumerate(configs):
-            chip_id = config.chip_id
-            events = RunEvents(
-                run_event=run_event,
-                kill_event=kill_event,
-                process_start_event=mp_context.Event(),
-                done_event=mp_context.Event(),
-                tti_first_load_event=mp_context.Event() if device_index == 0 else None,
-                initialize_completed_event=mp_context.Event() if sync_at_run_start else None,
-            )
-            output_wrapper = RunOutputs(
-                output_tensors_path=os.path.join(tmp_dirs[device_index], f"output_tensors_{chip_id}.pth"),
-                perf_q=mp_context.Queue(),
-            )
-            all_events.append(events)
-            all_output_wrappers.append(output_wrapper)
-            p = mp_context.Process(
-                target=GenerativeRun._single_thread_generative_model_run,
-                args=(config, events, output_wrapper)
-            )
-            p.start()
-            procs.append(p)
-            events.process_start_event.wait()
-            if events.tti_first_load_event:
-                events.tti_first_load_event.wait()
+def _initialize_tti_image(
+    output_dir: str,
+    precompiled_tti_path: 
Optional[str] = None, +): + # copy tti over to the output directory if it isn't already there + precompiled_tti_path = os.path.realpath(precompiled_tti_path) + precompiled_tti_name = os.path.basename(precompiled_tti_path) + image_path = os.path.join(output_dir, precompiled_tti_name) + if os.path.abspath(precompiled_tti_path) != os.path.abspath(image_path): + shutil.copy(precompiled_tti_path, image_path) + + return image_path - if sync_at_run_start: - for device_events in all_events: - device_events.wait_for_initialize_complete() - - logger.info(f"Initialize completed on all {num_devices} devices, launching run") - run_event.set() - - for device_events in all_events: - device_events.wait_for_run_complete() - - outputs_per_device = [output_wrapper.get_output_tensors() for output_wrapper in all_output_wrappers] - per_device_start_end = {device_ids[i]: all_output_wrappers[i].get_start_end_time() for i in range(num_devices)} +def _run( + run_mode: RunMode, + configs: Union[List[ForwardRunConfig], List[GenerativeRunConfig]], + output_dir: str, + sync_at_run_start: bool, + rm_tmp_dirs: bool, +): + procs = [] + device_ids_per_card = [config.chip_ids for config in configs] + num_cards = len(device_ids_per_card) + + mp_context = torch.multiprocessing.get_context('spawn') + all_events: List[RunEvents] = [] + all_output_wrappers: List[RunOutputs] = [] + # Shared events + kill_event = mp_context.Event() + run_event = mp_context.Event() if sync_at_run_start else None + + if run_mode == RunMode.FORWARD: + runner = ForwardRun._multi_thread_forward_run - # Terminate the processes after reading the outputs - kill_event.set() - for device_index, p in enumerate(procs): - p.join() - logger.info(f"Chip {device_ids[device_index]} finished run successfully") + elif run_mode == RunMode.GENERATIVE: + runner = GenerativeRun._single_thread_generative_model_run + + # Temporary directories for each device to dump intermediates such as outputs + tmp_dirs = [os.path.join(output_dir, f"tmp_device_{_encode_chip_ids(chip_ids)}") for chip_ids in device_ids_per_card] + for tmp_dir in tmp_dirs: + os.makedirs(tmp_dir, exist_ok=True) + + for card_index, config in enumerate(configs): + events = RunEvents( + run_event=run_event, + kill_event=kill_event, + process_start_event=mp_context.Event(), + done_event=mp_context.Event(), + tti_first_load_event=mp_context.Event() if card_index == 0 else None, + initialize_completed_event=mp_context.Event() if sync_at_run_start else None, + ) + output_wrapper = RunOutputs( + output_tensors_path=os.path.join(tmp_dirs[card_index], f"output_tensors_{_encode_chip_ids(config.chip_ids)}.pth"), + perf_q=mp_context.Queue(), + ) + all_events.append(events) + all_output_wrappers.append(output_wrapper) + p = mp_context.Process( + target=runner, + args=(config, events, output_wrapper) + ) + p.start() + procs.append(p) + events.process_start_event.wait() + if events.tti_first_load_event: + events.tti_first_load_event.wait() - # Clean up intermediate directories - if rm_tmp_dirs: - logger.info("Cleaning up temporary directories") - for tmp_dir in tmp_dirs: - shutil.rmtree(tmp_dir) + if sync_at_run_start: + for device_events in all_events: + device_events.wait_for_initialize_complete() - return GenerativeRun._create_run_result(outputs_per_device, per_device_start_end) + logger.info(f"Initialize completed on all {num_cards} cards, launching run") + run_event.set() + + for device_events in all_events: + device_events.wait_for_run_complete() + + outputs_per_card = [output_wrapper.get_output_tensors() for 
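
# A stripped-down sketch (hypothetical worker, no pybuda calls) of the
# per-card process choreography in _run above: one spawned process per card,
# a shared run_event so all cards launch together once initialized, and a
# kill_event that keeps workers alive until their outputs have been read.
import torch.multiprocessing

def _worker(card_index, ready_event, run_event, kill_event):
    # ... load the TTI and initialize the device here ...
    ready_event.set()   # mirrors wait_for_initialize_complete()
    run_event.wait()    # block until every card is ready
    # ... run the forward loops and dump output tensors here ...
    kill_event.wait()   # hold the process open until the parent is done

if __name__ == "__main__":
    ctx = torch.multiprocessing.get_context("spawn")
    run_event, kill_event = ctx.Event(), ctx.Event()
    ready = [ctx.Event() for _ in range(2)]
    procs = [ctx.Process(target=_worker, args=(i, ready[i], run_event, kill_event))
             for i in range(2)]
    for p in procs:
        p.start()
    for e in ready:
        e.wait()
    run_event.set()   # synchronized start across all cards
    kill_event.set()  # in the real flow, outputs are read before this
    for p in procs:
        p.join()
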
output_wrapper in all_output_wrappers] + per_card_start_end = {i: all_output_wrappers[i].get_start_end_time() for i in range(num_cards)} + + # Terminate the processes after reading the outputs + kill_event.set() + for proc_id, p in enumerate(procs): + p.join() + logger.info(f"Devices {device_ids_per_card[proc_id]} finished run successfully") -def split_tensor_batch(input_data, num_devices: int): + # Clean up intermediate directories + if rm_tmp_dirs: + logger.info("Cleaning up temporary directories") + for tmp_dir in tmp_dirs: + shutil.rmtree(tmp_dir) + + if run_mode == RunMode.FORWARD: + run_result: RunResult = ForwardRun._create_run_result(outputs_per_card, per_card_start_end) + elif run_mode == RunMode.GENERATIVE: + run_result: RunResult = GenerativeRun._create_run_result(outputs_per_card, per_card_start_end) + + return run_result + +def split_tensor_batch(input_data, num_cards: int): ''' Splits tensors in input data recursively - If input_data = ((tensor1, tensor2), tensor3) and we have 2 devices + If input_data = ((tensor1, tensor2), tensor3) and we have 2 cards returns [ [[first_half_tensor1, first_half_tensor2], first_half_tensor3]], [[second_half_tensor1, second_half_tensor2], second_half_tensor3]] ] ''' - inputs_per_device = [[] for _ in range(num_devices)] + inputs_per_card = [[] for _ in range(num_cards)] def _split_tensors(input_data, containers: List[List[Any]]): - num_devices = len(containers) + num_cards = len(containers) if isinstance(input_data, torch.Tensor): - assert input_data.shape[0] % num_devices == 0, "Number of devices must divide the total batch size evenly" - input_split = torch.tensor_split(input_data, num_devices, dim=0) - for device_index in range(num_devices): - containers[device_index] = input_split[device_index] + assert input_data.shape[0] % num_cards == 0, "Number of cards must divide the total batch size evenly" + input_split = torch.tensor_split(input_data, num_cards, dim=0) + for card_index in range(num_cards): + containers[card_index] = input_split[card_index] elif isinstance(input_data, (list, tuple)): for data in input_data: - new_containers = [[] for _ in range(num_devices)] + new_containers = [[] for _ in range(num_cards)] _split_tensors(data, new_containers) - for device_index in range(num_devices): - containers[device_index].append(new_containers[device_index]) + for card_index in range(num_cards): + containers[card_index].append(new_containers[card_index]) else: raise TypeError("Input data should contain list, tuple or torch tensor only") - _split_tensors(input_data, inputs_per_device) - return inputs_per_device + _split_tensors(input_data, inputs_per_card) + return inputs_per_card def detach_all_tensors(data): if isinstance(data, torch.Tensor): @@ -644,93 +585,38 @@ def detach_all_tensors(data): raise TypeError("Input data should contain list or torch tensor only") return data - -def compile_and_save_tti( - module, - arch: pybuda.BackendDevice, - device_id: int, - tti_output_path: str, - sample_inputs, - training: Optional[bool] = False, -): - tt0 = pybuda.TTDevice( - "tt0", - chip_ids=[device_id], - arch=arch, - devtype=pybuda.BackendType.Silicon - ) - tt0.place_module(module) - tt0.compile_to_image( - img_path=tti_output_path, - training=training, - sample_inputs=sample_inputs, - ) - -def initialize_tti_image( - arch: pybuda.BackendDevice, - first_device_id: int, - single_device_compile_inputs: Iterable[torch.Tensor], - output_dir: str, - module: Optional["Module"] = None, - precompiled_tti_path: Optional[str] = None, -): - if 
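
# Illustrative use of split_tensor_batch on the nested structure from its
# docstring; the tensor shapes are made up, only the splitting behavior
# along batch dim 0 matters.
import torch

batch = ((torch.zeros(4, 8), torch.ones(4, 8)), torch.arange(4.0))
per_card = split_tensor_batch(batch, num_cards=2)
assert len(per_card) == 2
assert per_card[0][0][0].shape == (2, 8)  # first half of tensor1
assert per_card[1][1].shape == (2,)       # second half of tensor3
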
precompiled_tti_path is None: - # Compile a tti on the fly if no precompiled tti is provided - # chip_ids are arbitrary, we will override it later when running the module - # fork a new process to compile the tti so we don't contaminate the state of the main process - image_path = os.path.join(output_dir, OUTPUT_TTI_NAME) - compile_and_save_tti( - module=module, - arch=arch, - device_id=first_device_id, - tti_output_path=image_path, - sample_inputs=single_device_compile_inputs, - ) - # Clear the compile configs populated when compiling the tti - pybuda_reset() - else: - # If a precompiled tti is provided - # copy it over to the output directory if it isn't already there - precompiled_tti_path = os.path.realpath(precompiled_tti_path) - precompiled_tti_name = os.path.basename(precompiled_tti_path) - image_path = os.path.join(output_dir, precompiled_tti_name) - if os.path.abspath(precompiled_tti_path) != os.path.abspath(image_path): - shutil.copy(precompiled_tti_path, image_path) - - return image_path def run_tti_data_parallel( arch: pybuda.BackendDevice, - device_ids: Iterable[int], + device_ids: List[List[int]], run_mode: RunMode, inputs: Union[ForwardRunInputs, GenerativeRunInputs], sync_at_run_start: bool = False, rm_tmp_dirs: bool = True, + precompiled_tti_path: str = None, output_dir: str = "./device_images", num_loops: Optional[int] = None, - module: Optional["Module"] = None, - precompiled_tti_path: Optional[str] = None, ) -> "RunResult": ''' - User-facing API. Run a module/precompiled-tti on multiple devices in parallel. + User-facing API. Run a tti on multiple cards in parallel. Arguments: - arch: Architecture of the devices. - - device_ids: List of device ids to run the module on, these device ids should all be mmio mapped. + - device_ids: List of device ids to run the tti on, each sublist should start with mmio-mapped device id. - run_mode: Mode to run on. Currently supports forward and generative runs. - inputs: List of inputs to run the tti on. - sync_at_run_start: If True, the processes will wait until all processes are ready to run before starting the run. - - rm_tmp_dirs: If True, remove all temporary directories created for each device. + - rm_tmp_dirs: If True, remove all temporary directories created for each card. + - precompiled_tti_path: Path to a precompiled tti image to run on the cards. - output_dir: Directory to store the ttis as well as the unzipped tti directories. If it doesn't exist, one will be created. If precompiled_tti_path is provided, the tti will be copied to this directory. - - num_loops: Number of loops to run the module. For generative runs, this will be hardcoded to 1. - - module: Module to be compiled as a tti and run on the devices, must be provided if precompiled_tti_path is not. - - precompiled_tti_path: Path to a precompiled tti image to run on the devices, must be provided if module is not. + - num_loops: Number of loops to run the tti. For generative runs, this will be hardcoded to 1. Returns: - - RunResult object containing the merged outputs and start/end times of the run on each device. + - RunResult object containing the merged outputs and start/end times of the run on each card. 
    '''
     assert arch in [pybuda.BackendDevice.Wormhole_B0, pybuda.BackendDevice.Grayskull], "Unsupported device architecture"
-    assert module or precompiled_tti_path, "Either a module or a precompiled tti path must be provided"
-    assert not (module and precompiled_tti_path), "Only one of module or precompiled tti path should be provided"
+    assert precompiled_tti_path, "A precompiled tti path must be provided"
+    if len(device_ids[0]) > 1:
+        assert os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1", "Only support multi-device override in N300 data parallel mode"
 
     if arch == pybuda.BackendDevice.Wormhole_B0 and os.environ.get("PYBUDA_FORCE_THREADS", "0") != "1":
         logger.warning("PYBUDA_FORCE_THREADS is not set, this may cause errors when running on multiple devices due to parallel execution of create-ethernet-map")
@@ -739,59 +625,42 @@ def run_tti_data_parallel(
 
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-
+
+    image_path = _initialize_tti_image(
+        output_dir=output_dir,
+        precompiled_tti_path=precompiled_tti_path,
+    )
+
     if run_mode == RunMode.FORWARD:
         assert isinstance(inputs, ForwardRunInputs)
-        inputs_per_device = ForwardRunInputs.get_inputs_per_device(inputs, len(device_ids))
-        image_path = initialize_tti_image(
-            arch=arch,
-            first_device_id=device_ids[0],
-            single_device_compile_inputs=inputs_per_device[0].inputs,
-            output_dir=output_dir,
-            module=module,
-            precompiled_tti_path=precompiled_tti_path,
-        )
+        inputs_per_card = ForwardRunInputs.get_inputs_per_card(inputs, len(device_ids))
         configs: List[ForwardRunConfig] = [
             ForwardRunConfig(
-                chip_id = device,
-                inputs = inputs_per_device[device_index],
+                chip_ids=devices,
+                inputs=inputs_per_card[card],
                 tti_path=image_path,
                 loop_count=num_loops,
-            ) for device, device_index in enumerate(device_ids)
+            ) for card, devices in enumerate(device_ids)
         ]
-        run_result: RunResult = ForwardRun.run(
-            configs=configs,
-            output_dir=output_dir,
-            sync_at_run_start=sync_at_run_start,
-            rm_tmp_dirs=rm_tmp_dirs,
-        )
 
     elif run_mode == RunMode.GENERATIVE:
         assert isinstance(inputs, GenerativeRunInputs)
-        inputs_per_device = GenerativeRunInputs.get_inputs_per_device(inputs, len(device_ids))
-        image_path = initialize_tti_image(
-            arch=arch,
-            first_device_id=device_ids[0],
-            single_device_compile_inputs=inputs_per_device[0].compile_inputs,
-            output_dir=output_dir,
-            module=module,
-            precompiled_tti_path=precompiled_tti_path,
-        )
+        inputs_per_card = GenerativeRunInputs.get_inputs_per_card(inputs, len(device_ids))
         configs: List[GenerativeRunConfig] = [
             GenerativeRunConfig(
-                chip_id = device,
-                inputs = inputs_per_device[device_index],
+                chip_ids=devices,
+                inputs=inputs_per_card[card],
                 tti_path=image_path,
-            ) for device, device_index in enumerate(device_ids)
+            ) for card, devices in enumerate(device_ids)
         ]
-        run_result: RunResult = GenerativeRun.run(
-            configs=configs,
-            output_dir=output_dir,
-            sync_at_run_start=sync_at_run_start,
-            rm_tmp_dirs=rm_tmp_dirs
-        )
     else:
         raise TypeError("Invalid run mode provided. Supported modes are FORWARD and GENERATIVE.")
 
+    run_result: RunResult = _run(run_mode=run_mode, configs=configs, output_dir=output_dir, sync_at_run_start=sync_at_run_start, rm_tmp_dirs=rm_tmp_dirs)
+
+    return run_result
\ No newline at end of file
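
# Hypothetical end-to-end call (paths and device ids are placeholders),
# mirroring the sanity tests added later in this patch: one precompiled TTI
# run across two single-chip cards with a batch of 256 split between them.
import torch
import pybuda

result: RunResult = run_tti_data_parallel(
    arch=pybuda.BackendDevice.Wormhole_B0,
    device_ids=[[0], [1]],              # one mmio-mapped chip id per card
    run_mode=RunMode.FORWARD,
    inputs=ForwardRunInputs(inputs=[torch.randn(256, 3, 224, 224)]),
    precompiled_tti_path="device_images/model.tti",
    output_dir="device_images/",
    num_loops=16,
    sync_at_run_start=True,
)
print(result.get_total_runtime())
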
diff --git a/pybuda/pybuda/ttdevice.py b/pybuda/pybuda/ttdevice.py
index c11d9891..67b9c5f9 100644
--- a/pybuda/pybuda/ttdevice.py
+++ b/pybuda/pybuda/ttdevice.py
@@ -1551,13 +1551,13 @@ def load_image(
     *,
     img: Optional["TTDeviceImage"] = None,
     img_path: Optional[str] = None,
-    device_id_override: Optional[int] = None
+    device_id_overrides: Optional[List[int]] = None
 ) -> "TTDevice":
     from .tti import TTDeviceImage
     if img and img_path:
         logger.error("only one of image/image-path should be specified")
     if img is None:
-        img = TTDeviceImage.load_from_disk(img_path, device_id_override)
+        img = TTDeviceImage.load_from_disk(img_path, device_id_overrides)
     return TTDeviceImage.create_device_from_image(img)
 
diff --git a/pybuda/pybuda/tti/archive.py b/pybuda/pybuda/tti/archive.py
index fc9b10e7..5048c597 100644
--- a/pybuda/pybuda/tti/archive.py
+++ b/pybuda/pybuda/tti/archive.py
@@ -31,7 +31,7 @@
 import torch
 import json
 import pickle
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union, Match
 from pybuda.optimizers import Optimizer
 from pybuda.backend import BackendAPI
 from pybuda._C.backend_api import (
@@ -41,6 +41,7 @@
     binarize_tensor,
     debinarize_tensor,
     tilize_tensor,
+    get_device_cluster_yaml,
 )
 from pybuda._C import DataFormat
 
@@ -331,66 +332,190 @@ def _get_device_img_path(device_img_path_override: Optional[str] = None):
             device_img_path = DEFAULT_DEVICE_PATH
         return device_img_path
 
+    @staticmethod
+    def _device_override_str(device_id_overrides: List[int]) -> str:
+        return "_".join([str(device_id) for device_id in device_id_overrides])
+
     @staticmethod
-    def _get_override_netlist_path(original_netlist_path: str, device_id_override: int) -> str:
-        return f"{os.path.splitext(original_netlist_path)[0]}_override_device_{device_id_override}.yaml"
+    def _get_override_netlist_path(original_netlist_path: str, device_id_overrides: List[int]) -> str:
+        device_override_str = TTIArchive._device_override_str(device_id_overrides)
+        return f"{os.path.splitext(original_netlist_path)[0]}_override_device_{device_override_str}.yaml"
 
     @staticmethod
-    def _get_override_backend_output_dir(oringal_binaries_path: str, device_id_override: int) -> str:
-        return f"{oringal_binaries_path}_override_device_{device_id_override}"
+    def _get_override_backend_output_path(original_binaries_path: str, device_id_overrides: List[int]) -> str:
+        device_override_str = TTIArchive._device_override_str(device_id_overrides)
+        return f"{original_binaries_path}_override_device_{device_override_str}"
 
     @staticmethod
-    def _create_device_override_netlist_yaml(original_netlist_path: str, device_id_override: int) -> str:
-        # Can't use yaml library here, netlist needs to be in specific format
-        target_device_pattern = r'target_device:\s*\d+'
-        new_netlist_path = TTIArchive._get_override_netlist_path(original_netlist_path, device_id_override)
+    def _get_original_device_to_new_device_map(netlist_file_path: str, device_id_overrides: List[int]):
+        with open(netlist_file_path, "r") as netlist_file:
+            netlist_str = netlist_file.read()
 
-        if os.path.exists(new_netlist_path):
-            return new_netlist_path
+        single_target_device_pattern = re.compile(r'\btarget_device:\s*(\d+)\b')
+        multi_target_device_pattern = re.compile(r'target_device:\s*\[(\d+),\s*(\d+)\]')
 
-        with 
open(original_netlist_path, "r") as netlist_file: - netlist_str = netlist_file.read() - - netlist_str_override = re.sub(target_device_pattern, f"target_device: {device_id_override}", netlist_str) - with open(new_netlist_path, "w") as new_netlist_file: - new_netlist_file.write(netlist_str_override) + if len(device_id_overrides) == 1: + m = single_target_device_pattern.search(netlist_str) + assert m, "Expected single target_device in netlist" + original_device = int(m.group(1)) + return {original_device: device_id_overrides[0]} + + m = multi_target_device_pattern.search(netlist_str) + assert m, "Expected multi-target_device in netlist" + original_devices = (int(m.group(1)), int(m.group(2))) + return {old_device: new_device for old_device, new_device in zip(original_devices, device_id_overrides)} + + @staticmethod + def _update_n300_dp_trisc_firmware_directories( + netlist_path: str, + old_device_to_new_device_map: Dict[int, int] + ) -> None: + override_backend_outdir = os.path.dirname(netlist_path) + with open(netlist_path, "r") as netlist_file: + netlist_map = yaml.safe_load(netlist_file) - return new_netlist_path + device_suffix_pattern = re.compile(r'\.(\d+)$') + all_graphs = list(netlist_map["graphs"].keys()) + + temp_sub_dirs = [] + for sub_dir in os.listdir(override_backend_outdir): + m = device_suffix_pattern.search(sub_dir) + # Locate trisc firmware directories + if m and any([graph in sub_dir for graph in all_graphs]): + old_device_id = int(m.group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + # temp name to prevent name collisions + temp_sub_dir = device_suffix_pattern.sub(f".{new_device_id}_temp", sub_dir) + temp_sub_dirs.append(temp_sub_dir) + os.rename(os.path.join(override_backend_outdir, sub_dir), os.path.join(override_backend_outdir, temp_sub_dir)) + + # Finalize temporary directory names + device_suffix_pattern_temp = re.compile(r'\.(\d+)_temp$') + for temp_sub_dir in temp_sub_dirs: + new_sub_dir = device_suffix_pattern_temp.sub(r".\1", temp_sub_dir) + os.rename(os.path.join(override_backend_outdir, temp_sub_dir), os.path.join(override_backend_outdir, new_sub_dir)) + + @staticmethod + def _update_n300_dp_nops_in_netlist_string(netlist_str: str, old_device_to_new_device_map: Dict[int, int]) -> str: + netlist_map = yaml.safe_load(netlist_str) + dp_nop_pattern = re.compile(r'dp_nop\.(\d+)$') + device_suffix_pattern = re.compile(r'\.(\d+)$') + + def override_handler(m: Match, old_device_to_new_device_map: Dict[int, int]): + matched_string = m.group(0) + old_device_id = int(device_suffix_pattern.search(matched_string).group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + return device_suffix_pattern.sub(f".{new_device_id}", matched_string) + + fields_to_override = set() + graphs_map = netlist_map["graphs"] + + for graph_name, ops_map in graphs_map.items(): + for op_name, op_configs in ops_map.items(): + if not dp_nop_pattern.search(op_name): + continue + fields_to_override.add(op_name) + op_inputs = op_configs["inputs"] + for op_input in op_inputs: + if not device_suffix_pattern.search(op_input): + continue + fields_to_override.add(op_input) + + new_netlist_str = re.sub("|".join(fields_to_override), lambda m: override_handler(m, old_device_to_new_device_map), netlist_str) + return new_netlist_str + @staticmethod + def _update_n300_dp_compiled_graph_state( + compiled_graph_state: "CompiledGraphState", + old_device_to_new_device_map: Dict[int, int], + ) -> None: + device_suffix_pattern = re.compile(r'\.(\d+)$') + def 
update_device_id_suffix(items: Union[Dict[str, str], List[str]]): + assert isinstance(items, (dict, list)), "Expected items to be a dict or list" + if isinstance(items, dict): + updated_items: Dict[str, str] = {} + for k, v in items.items(): + old_device_id = int(device_suffix_pattern.search(k).group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + updated_items[device_suffix_pattern.sub(f".{new_device_id}", k)] = v + return updated_items + else: + updated_items: List[str] = [] + for item in items: + old_device_id = int(device_suffix_pattern.search(item).group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + updated_items.append(device_suffix_pattern.sub(f".{new_device_id}", item)) + return updated_items + + compiled_graph_state.input_to_tile_dims = update_device_id_suffix(compiled_graph_state.input_to_tile_dims) + + compiled_graph_state.post_const_eval_constants = update_device_id_suffix(compiled_graph_state.post_const_eval_constants) + compiled_graph_state.post_const_eval_parameters = update_device_id_suffix(compiled_graph_state.post_const_eval_parameters) + + compiled_graph_state.ordered_constant_node_names = update_device_id_suffix(compiled_graph_state.ordered_constant_node_names) + compiled_graph_state.ordered_parameter_node_names = update_device_id_suffix(compiled_graph_state.ordered_parameter_node_names) + + compiled_graph_state.ordered_input_names = update_device_id_suffix(compiled_graph_state.ordered_input_names) + compiled_graph_state.ordered_output_names = update_device_id_suffix(compiled_graph_state.ordered_output_names) + # Keep consistent with epoch_loader.cpp::update_overlay_binary @staticmethod def _update_overlay_binary_hex_filenames( override_backend_output_dir: str, - device_id_override: int - ): + old_device_to_new_device_map: Dict[int, int], + ) -> None: + def rename_blob_file_temp(m: Match, old_device_to_new_device_map: Dict[int, int]): + old_device_id = int(m.group(2)) + new_device_id = old_device_to_new_device_map[old_device_id] + return f'pipegen_epoch{m.group(1)}_{new_device_id}_{m.group(3)}_{m.group(4)}_temp.hex' + + def rename_blob_file_final(m: Match): + return f'pipegen_epoch{m.group(1)}_{m.group(2)}_{m.group(3)}_{m.group(4)}.hex' + temporal_epoch_dir_re = re.compile(r"^temporal_epoch_\d+$") # (temporal_epoch)_(chip_id)_(route_r)_(route_c) overlay_blob_hex_re = re.compile(r"^pipegen_epoch(\d+)_(\d+)_(\d+)_(\d+).hex$") - substitution = r'pipegen_epoch\1_' + str(device_id_override) + r'_\3_\4.hex' + temp_overlay_blob_hex_re = re.compile(r"^pipegen_epoch(\d+)_(\d+)_(\d+)_(\d+)_temp.hex$") temporal_epoch_dirs = [os.path.join(override_backend_output_dir, epoch_dir) for epoch_dir in os.listdir(override_backend_output_dir) if temporal_epoch_dir_re.match(epoch_dir)] for temporal_epoch_dir in temporal_epoch_dirs: blobs_dir = os.path.join(temporal_epoch_dir, "overlay", "blobs") if not os.path.isdir(blobs_dir): continue - blobs_dir = os.path.join(temporal_epoch_dir, "overlay", "blobs") - for blob_hex in os.listdir(blobs_dir): - if overlay_blob_hex_re.match(blob_hex): - new_blob_hex = re.sub(overlay_blob_hex_re, substitution, blob_hex) - os.rename(os.path.join(blobs_dir, blob_hex), os.path.join(blobs_dir, new_blob_hex)) + temp_blob_files: List[str] = [] + for blob_hex_name in os.listdir(blobs_dir): + # Rename blob files to a temporary name to prevent name collisions + if overlay_blob_hex_re.match(blob_hex_name): + temp_blob_hex_name = overlay_blob_hex_re.sub(lambda m: rename_blob_file_temp(m, old_device_to_new_device_map), 
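
# Sketch of the ".{chip}" suffix remap applied above to the compiled graph
# state: dict keys and name lists alike carry a device suffix that must
# follow the override map (names here are illustrative).
import re

suffix = re.compile(r"\.(\d+)$")
mapping = {0: 2, 1: 3}

names = ["dp_nop.0", "dp_nop.1"]
remapped = [suffix.sub(lambda m: f".{mapping[int(m.group(1))]}", n) for n in names]
assert remapped == ["dp_nop.2", "dp_nop.3"]
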
blob_hex_name) + os.rename(os.path.join(blobs_dir, blob_hex_name), os.path.join(blobs_dir, temp_blob_hex_name)) + temp_blob_files.append(temp_blob_hex_name) + + # After all device ids have been updated, rename the temporary blob files to the final name + for temp_blob_file in temp_blob_files: + final_blob_hex_name = temp_overlay_blob_hex_re.sub(rename_blob_file_final, temp_blob_file) + os.rename(os.path.join(blobs_dir, temp_blob_file), os.path.join(blobs_dir, final_blob_hex_name)) # Keep consistent with epoch_loader.cpp::populate_queue_to_core_map_from_net2pipe @staticmethod def _update_producer_consumer_queue_yaml( override_backend_output_dir: str, - device_id_override: int - ): - temporal_epoch_dir_re = re.compile(r"^temporal_epoch_\d+$") - chip_id_pattern = r'chip_id:\s*\d+' - queue_target_device_pattern = r'queue_target_device:\s*\d+' + old_device_to_new_device_map: Dict[int, int], + ) -> None: + + def override_queue_target_device(m: Match, old_device_to_new_device_map: Dict[int, int]): + old_device_id = int(m.group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + return f"queue_target_device: {new_device_id}" + + def override_chip_id(m: Match, old_device_to_new_device_map: Dict[int, int]): + old_device_id = int(m.group(1)) + new_device_id = old_device_to_new_device_map[old_device_id] + return f"chip_id: {new_device_id}" - chip_id_override = f"chip_id: {device_id_override}" - queue_target_device_override = f"queue_target_device: {device_id_override}" + temporal_epoch_dir_re = re.compile(r"^temporal_epoch_\d+$") + chip_id_pattern = re.compile(r'chip_id:\s*(\d+)') + queue_target_device_pattern = re.compile(r'queue_target_device:\s*(\d+)') + temporal_epoch_dirs = [os.path.join(override_backend_output_dir, epoch_dir) for epoch_dir in os.listdir(override_backend_output_dir) if temporal_epoch_dir_re.match(epoch_dir)] for temporal_epoch_dir in temporal_epoch_dirs: @@ -401,8 +526,8 @@ def _update_producer_consumer_queue_yaml( with open(queue_to_consumer_path, "r") as old_q_consumer_file: queue_to_consumer_str = old_q_consumer_file.read() - queue_to_consumer_str_override = re.sub(chip_id_pattern, chip_id_override, queue_to_consumer_str) - queue_to_consumer_str_override = re.sub(queue_target_device_pattern, queue_target_device_override, queue_to_consumer_str_override) + queue_to_consumer_str_override = chip_id_pattern.sub(lambda m: override_chip_id(m, old_device_to_new_device_map), queue_to_consumer_str) + queue_to_consumer_str_override = queue_target_device_pattern.sub(lambda m: override_queue_target_device(m, old_device_to_new_device_map), queue_to_consumer_str_override) with open(queue_to_consumer_path, "w") as new_q_consumer_file: new_q_consumer_file.write(queue_to_consumer_str_override) @@ -411,8 +536,8 @@ def _update_producer_consumer_queue_yaml( with open(queue_to_producer_path, "r") as old_q_producer_file: queue_to_producer_str = old_q_producer_file.read() - queue_to_producer_str_override = re.sub(chip_id_pattern, chip_id_override, queue_to_producer_str) - queue_to_producer_str_override = re.sub(queue_target_device_pattern, queue_target_device_override, queue_to_producer_str_override) + queue_to_producer_str_override = chip_id_pattern.sub(lambda m: override_chip_id(m, old_device_to_new_device_map), queue_to_producer_str) + queue_to_producer_str_override = queue_target_device_pattern.sub(lambda m: override_queue_target_device(m, old_device_to_new_device_map), queue_to_producer_str_override) with open(queue_to_producer_path, "w") as new_q_producer_file: 
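
# Self-contained sketch of the collision-free two-phase rename used above for
# trisc firmware directories and pipegen blob hexes: with a swapping map such
# as {0: 1, 1: 0}, renaming in place would overwrite, so every name first
# moves to a *_temp form and only then receives its final name.
import re

blobs = ["pipegen_epoch0_0_1_1.hex", "pipegen_epoch0_1_1_1.hex"]
mapping = {0: 1, 1: 0}
pat = re.compile(r"^pipegen_epoch(\d+)_(\d+)_(\d+)_(\d+)\.hex$")

temp = [pat.sub(lambda m: f"pipegen_epoch{m.group(1)}_{mapping[int(m.group(2))]}_{m.group(3)}_{m.group(4)}_temp.hex", b)
        for b in blobs]
final = [re.sub(r"_temp\.hex$", ".hex", t) for t in temp]
assert final == ["pipegen_epoch0_1_1_1.hex", "pipegen_epoch0_0_1_1.hex"]
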
new_q_producer_file.write(queue_to_producer_str_override) @@ -420,51 +545,108 @@ def _update_producer_consumer_queue_yaml( @staticmethod def _update_overlay_blob_dir_with_override_device_id( override_backend_output_dir: str, - device_id_override: int - ): + old_device_to_new_device_map: Dict[int, int], + ) -> None: # Update chip_id in overlay blob hex file names - TTIArchive._update_overlay_binary_hex_filenames(override_backend_output_dir, device_id_override) + TTIArchive._update_overlay_binary_hex_filenames(override_backend_output_dir, old_device_to_new_device_map) # Update device id in queue_to_consumer.yaml and queue_to_producer.yaml - TTIArchive._update_producer_consumer_queue_yaml(override_backend_output_dir, device_id_override) + TTIArchive._update_producer_consumer_queue_yaml(override_backend_output_dir, old_device_to_new_device_map) + + + @staticmethod + def _create_device_override_netlist_yaml(original_netlist_path: str, device_id_overrides: List[int]) -> str: + + def replace_target_device_with_new_device(m: Match, old_device_to_new_device_map: Dict[int, int]): + old_device = int(m.group(1)) + new_device = old_device_to_new_device_map[old_device] + return f"target_device: {new_device}" + + single_device_pattern = re.compile(r'\btarget_device:\s*(\d+)\b') + # Can't use yaml library here, netlist needs to be in specific format + new_netlist_path = TTIArchive._get_override_netlist_path(original_netlist_path, device_id_overrides) + + if os.path.exists(new_netlist_path): + return new_netlist_path + + with open(original_netlist_path, "r") as netlist_file: + netlist_str_override = netlist_file.read() + + # Create a map that maps original device to new device + old_device_to_new_device_map = TTIArchive._get_original_device_to_new_device_map(original_netlist_path, device_id_overrides) + + # Override single device ids with its new device equivalent + netlist_str_override = single_device_pattern.sub(lambda m: replace_target_device_with_new_device(m, old_device_to_new_device_map), netlist_str_override) + + if len(device_id_overrides) > 1: + assert os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1", "Only support multi-device override in N300 data parallel mode" + assert len(device_id_overrides) == 2, "Only support 1 or 2 device overrides" + multi_target_device_pattern = re.compile(r'target_device:\s*\[(\d+),\s*(\d+)\]') + + # Override multi device arrays with new device ids + netlist_str_override = multi_target_device_pattern.sub(rf"target_device: {device_id_overrides}", netlist_str_override) + netlist_str_override = TTIArchive._update_n300_dp_nops_in_netlist_string(netlist_str_override, old_device_to_new_device_map) + + with open(new_netlist_path, "w") as new_netlist_file: + new_netlist_file.write(netlist_str_override) + + return new_netlist_path + + @staticmethod + def _update_cluster_desc_yaml(cluster_desc_path: str) -> str: + if os.path.exists(cluster_desc_path): + os.remove(cluster_desc_path) + cluster_output_dir = os.path.dirname(cluster_desc_path) + new_cluster_descriptor_path = get_device_cluster_yaml(cluster_output_dir) + return new_cluster_descriptor_path @staticmethod def _update_runtime_data_yaml_with_override_device_id( new_backend_output_dir: str, - device_id_override: int - ): + old_device_to_new_device_map: Dict[int, int], + ) -> None: + device_id_overrides = list(old_device_to_new_device_map.values()) + # Update runtime data yaml new_runtime_yaml_path = os.path.join(new_backend_output_dir, "runtime_data.yaml") with open(new_runtime_yaml_path, "r") as f: new_runtime_data 
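
# Minimal sketch of the two target_device rewrites performed above, on a toy
# netlist string (real netlists have many more fields): single ids go through
# the old->new map, and the n300 two-device list form is replaced wholesale.
import re

netlist = "graphs:\n  fwd_0:\n    target_device: 0\nprograms:\n  target_device: [0, 1]\n"
mapping = {0: 2, 1: 3}
overrides = [2, 3]

out = re.sub(r"\btarget_device:\s*(\d+)\b",
             lambda m: f"target_device: {mapping[int(m.group(1))]}", netlist)
out = re.sub(r"target_device:\s*\[(\d+),\s*(\d+)\]",
             f"target_device: {overrides}", out)
assert "target_device: 2" in out and "target_device: [2, 3]" in out
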
= yaml.safe_load(f) # Update worker_grid_sizes_per_chip - num_devices = len(new_runtime_data["worker_grid_sizes_per_chip"].keys()) - assert num_devices == 1, f"Unexpected TTI not compiled on single device: {num_devices} devices in worker_grid_sizes_per_chip" - if device_id_override not in new_runtime_data["worker_grid_sizes_per_chip"]: - original_device_id = list(new_runtime_data["worker_grid_sizes_per_chip"].keys()).pop() - new_runtime_data["worker_grid_sizes_per_chip"][device_id_override] = new_runtime_data["worker_grid_sizes_per_chip"].pop(original_device_id) + old_worker_grid_sizes_per_chip = new_runtime_data["worker_grid_sizes_per_chip"] + new_worker_grid_sizes_per_chip = {} + assert len(old_worker_grid_sizes_per_chip) == len(device_id_overrides), f"Num devices mismatch between runtime data worker_grid_sizes_per_chip and devices to override" + + for old_device_id in old_worker_grid_sizes_per_chip: + new_device_id = old_device_to_new_device_map[old_device_id] + new_worker_grid_sizes_per_chip[new_device_id] = old_worker_grid_sizes_per_chip[old_device_id] + + new_runtime_data["worker_grid_sizes_per_chip"] = new_worker_grid_sizes_per_chip # Update harvested_rows_per_chip - # Set the harvesting mask to be the same as what we used for the original device id + # Set the harvesting mask to be the same as what we used for the original device ids # If the grid size is not the same as what runtime detects during run # runtime will use the actual harvesting mask, and overlay will be recompiled implicitly by runtime - num_devices = len(new_runtime_data["harvested_rows_per_chip"].keys()) - assert num_devices == 1, f"Unexpected TTI not compiled on single device: {num_devices} devices in harvested_rows_per_chip" - if device_id_override not in new_runtime_data["harvested_rows_per_chip"]: - original_device_id = list(new_runtime_data["harvested_rows_per_chip"].keys()).pop() - new_runtime_data["harvested_rows_per_chip"][device_id_override] = new_runtime_data["harvested_rows_per_chip"].pop(original_device_id) + old_harvested_rows_per_chip = new_runtime_data["harvested_rows_per_chip"] + new_harvested_rows_per_chip = {} + assert len(old_harvested_rows_per_chip) == len(device_id_overrides), f"Num devices mismatch between runtime data harvested_rows_per_chip and devices to override" + + for old_device_id in old_harvested_rows_per_chip: + new_device_id = old_device_to_new_device_map[old_device_id] + new_harvested_rows_per_chip[new_device_id] = old_harvested_rows_per_chip[old_device_id] + + new_runtime_data["harvested_rows_per_chip"] = new_harvested_rows_per_chip with open(new_runtime_yaml_path, 'w') as f: yaml.safe_dump(new_runtime_data, f) - + @staticmethod def _create_device_override_backend_output_dir( original_backend_output_dir: str, original_netlist_path: str, - device_id_override: int + device_id_overrides: List[int] ) -> str: - new_backend_output_dir = TTIArchive._get_override_backend_output_dir(original_backend_output_dir, device_id_override) + new_backend_output_dir = TTIArchive._get_override_backend_output_path(original_backend_output_dir, device_id_overrides) if os.path.exists(new_backend_output_dir): logger.info("TTDeviceImage: Using existing device override binaries directory {}", new_backend_output_dir) return new_backend_output_dir @@ -475,15 +657,26 @@ def _create_device_override_backend_output_dir( # Remove the original netlist and copy over the override netlist to the new binaries directory original_netlist_name = os.path.basename(original_netlist_path) 
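
# Sketch of the runtime_data.yaml per-chip remap above: keys move from the
# compiled chip ids to the override ids while the values (grid sizes, harvest
# masks) are carried over unchanged. The yaml content is a made-up example.
import yaml

runtime = yaml.safe_load(
    "worker_grid_sizes_per_chip:\n  0: [8, 10]\nharvested_rows_per_chip:\n  0: 2050\n"
)
mapping = {0: 4}
for key in ("worker_grid_sizes_per_chip", "harvested_rows_per_chip"):
    runtime[key] = {mapping[old]: v for old, v in runtime[key].items()}
assert runtime["worker_grid_sizes_per_chip"] == {4: [8, 10]}
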
os.remove(os.path.join(new_backend_output_dir, original_netlist_name)) - override_netlist_path = TTIArchive._get_override_netlist_path(original_netlist_path, device_id_override) + + # Path to the netlist file override directly under unzipped_tti directory + override_netlist_path = TTIArchive._get_override_netlist_path(original_netlist_path, device_id_overrides) override_netlist_name = os.path.basename(override_netlist_path) - TTIArchive._copy_netlist_yaml(netlist_yaml=override_netlist_path, dst_dir=os.path.join(new_backend_output_dir, override_netlist_name)) + + override_netlist_path_in_backend_outdir = os.path.join(new_backend_output_dir, override_netlist_name) + + TTIArchive._copy_netlist_yaml(netlist_yaml=override_netlist_path, dst_dir=override_netlist_path_in_backend_outdir) + old_device_to_new_device_map = TTIArchive._get_original_device_to_new_device_map(original_netlist_path, device_id_overrides) + # Update runtime data yaml - TTIArchive._update_runtime_data_yaml_with_override_device_id(new_backend_output_dir, device_id_override) + TTIArchive._update_runtime_data_yaml_with_override_device_id(new_backend_output_dir, old_device_to_new_device_map) # Update relative files in the overlay output directories - TTIArchive._update_overlay_blob_dir_with_override_device_id(new_backend_output_dir, device_id_override) + TTIArchive._update_overlay_blob_dir_with_override_device_id(new_backend_output_dir, old_device_to_new_device_map) + + if os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1": + # Update device id suffix in trisc firmware directories + TTIArchive._update_n300_dp_trisc_firmware_directories(override_netlist_path_in_backend_outdir, old_device_to_new_device_map) return new_backend_output_dir @@ -520,7 +713,7 @@ def get_instantiate_modules( return instantiated_modules @staticmethod - def construct_device_image(unzipped_tti_directory: str, device_id_override: Optional[int] = None) -> "TTDeviceImage": + def construct_device_image(unzipped_tti_directory: str, device_id_overrides: Optional[List[int]] = None) -> "TTDeviceImage": from .tti import TTDeviceImage device_image = None @@ -528,6 +721,7 @@ def construct_device_image(unzipped_tti_directory: str, device_id_override: Opti os.path.join(unzipped_tti_directory, "device.json"), "r" ) as json_file: device_image_dict = json.load(json_file, cls=TTDeviceImageJsonDecoder) + TTDeviceImageJsonDecoder.postprocess_keys( device_image_dict, unzipped_tti_directory ) @@ -539,9 +733,14 @@ def construct_device_image(unzipped_tti_directory: str, device_id_override: Opti f"TTI failed to deserialize. TTDeviceImage not contain key: {e}. TTI recompilation required." ) - if device_id_override: - assert len(device_image.chip_ids) == 1, "Cannot override multi-device TTI image with single device" - device_image.chip_ids = [device_id_override] + if device_id_overrides is not None: + if os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1" or len(device_id_overrides) == 1: + # In n300 data parallel mode, the chip_ids of the device image is of length 1 as well + assert len(device_image.chip_ids) == 1, "Cannot override multi-device TTI image with single device" + device_image.chip_ids = [device_id_overrides[0]] + + else: + device_image.chip_ids = device_id_overrides sys.path.append( "." 
@@ -560,16 +759,20 @@ def construct_device_image(unzipped_tti_directory: str, device_id_override: Opti netlist_file_path = original_netlist_file_path - if device_id_override is not None: - netlist_file_path = TTIArchive._create_device_override_netlist_yaml(netlist_file_path, device_id_override) + if device_id_overrides is not None: + netlist_file_path = TTIArchive._create_device_override_netlist_yaml(netlist_file_path, device_id_overrides) device_image.compiled_graph_state.netlist_filename = netlist_file_path - + # Update the device image compiled_graph_state with the new device ids + if device_id_overrides is not None and os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1": + old_device_to_new_device_map = TTIArchive._get_original_device_to_new_device_map(original_netlist_file_path, device_id_overrides) + TTIArchive._update_n300_dp_compiled_graph_state(device_image.compiled_graph_state, old_device_to_new_device_map) + return device_image, original_netlist_file_path @staticmethod - def load_from_disk(tti_file_path: str, device_id_override: Optional[int] = None) -> "TTDeviceImage": + def load_from_disk(tti_file_path: str, device_id_overrides: Optional[List[int]] = None) -> "TTDeviceImage": tti_file_path = TTIArchive._get_device_img_path(tti_file_path) absolute_device_image_path = os.path.realpath(tti_file_path) logger.info("TTDeviceImage::loading from {}", absolute_device_image_path) @@ -585,7 +788,9 @@ def contains_matching_checksum(tti_checksum) -> bool: return tti_checksum == directory_checksum tti_checksum = compute_file_checksum(absolute_device_image_path) - if contains_matching_checksum(tti_checksum): + found_matching_checksum = contains_matching_checksum(tti_checksum) + + if found_matching_checksum: logger.info( f"TTI: Netlist checksum matches - populating TTDevice from pre-existing dir {unzipped_tti_directory}" ) @@ -604,17 +809,29 @@ def contains_matching_checksum(tti_checksum) -> bool: tti_checksum, os.path.join(unzipped_tti_directory, "checksum.txt") ) - device_image, original_netlist_file_path = TTIArchive.construct_device_image(unzipped_tti_directory, device_id_override) + device_image, original_netlist_file_path = TTIArchive.construct_device_image(unzipped_tti_directory, device_id_overrides) + + if device_image.compiler_cfg.backend_cluster_descriptor_path: + device_image.compiler_cfg.backend_cluster_descriptor_path = os.path.join( + absolute_device_image_directory, + device_image.compiler_cfg.backend_cluster_descriptor_path + ) + # If we unzipped the tti for the first time and we are overriding devices + # Regenerate the cluster descriptor + if device_id_overrides is not None and not found_matching_checksum: + new_cluster_desc_path = TTIArchive._update_cluster_desc_yaml(device_image.compiler_cfg.backend_cluster_descriptor_path) + device_image.compiler_cfg.backend_cluster_descriptor_path = new_cluster_desc_path + if device_image.compiler_cfg.backend_output_dir: device_image.compiler_cfg.backend_output_dir = os.path.join( absolute_device_image_directory, device_image.compiler_cfg.backend_output_dir, ) - if device_id_override is not None: + if device_id_overrides is not None: device_image.compiler_cfg.backend_output_dir = TTIArchive._create_device_override_backend_output_dir( device_image.compiler_cfg.backend_output_dir, original_netlist_file_path, - device_id_override + device_id_overrides, ) if device_image.compiler_cfg.backend_runtime_params_path: @@ -629,11 +846,6 @@ def contains_matching_checksum(tti_checksum) -> bool: 
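
# Hypothetical sketch of the checksum gate above (helper names are
# assumptions; the real compute_file_checksum lives elsewhere in pybuda):
# the TTI archive is re-extracted only when its checksum no longer matches
# the one recorded in the unzipped directory at the previous extraction.
import hashlib
import os

def compute_file_checksum(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

def needs_unzip(tti_path, unzipped_dir):
    record = os.path.join(unzipped_dir, "checksum.txt")
    if not os.path.isfile(record):
        return True
    with open(record) as f:
        return f.read().strip() != compute_file_checksum(tti_path)
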
device_image.compiler_cfg.backend_device_descriptor_path ) - if device_image.compiler_cfg.backend_cluster_descriptor_path: - device_image.compiler_cfg.backend_cluster_descriptor_path = os.path.join( - absolute_device_image_directory, - device_image.compiler_cfg.backend_cluster_descriptor_path - ) _set_global_compiler_config(device_image.compiler_cfg) diff --git a/pybuda/pybuda/tti/tti.py b/pybuda/pybuda/tti/tti.py index 99f6b4fb..0e2496bf 100644 --- a/pybuda/pybuda/tti/tti.py +++ b/pybuda/pybuda/tti/tti.py @@ -130,9 +130,9 @@ def create_image_from_device( return device_image @staticmethod - def load_from_disk(tti_file_path: str, device_id_override: Optional[int] = None) -> "TTDeviceImage": + def load_from_disk(tti_file_path: str, device_id_overrides: Optional[List[int]] = None) -> "TTDeviceImage": from .archive import TTIArchive - return TTIArchive.load_from_disk(tti_file_path, device_id_override) + return TTIArchive.load_from_disk(tti_file_path, device_id_overrides) @staticmethod def save_to_disk( diff --git a/pybuda/test/benchmark/benchmark.py b/pybuda/test/benchmark/benchmark.py index 9419b6a0..52c8e952 100755 --- a/pybuda/test/benchmark/benchmark.py +++ b/pybuda/test/benchmark/benchmark.py @@ -427,7 +427,7 @@ def run( assert not args.training, "Training not supported in parallel tti run" assert args.chips == 1, "Parallel TTI only supported for single chip models" assert len(device_list) > 0 - device_ids = list(range(len(device_list))) + device_ids = [[i] for i in range(len(device_list))] precompiled_image_path = args.load_tti if args.load_tti else None tt_device = None if args.load_tti else tt diff --git a/pybuda/test/benchmark/run_benchmark_tti_data_parallel b/pybuda/test/benchmark/run_benchmark_tti_data_parallel index 703b1653..5a36dabd 100644 --- a/pybuda/test/benchmark/run_benchmark_tti_data_parallel +++ b/pybuda/test/benchmark/run_benchmark_tti_data_parallel @@ -1,6 +1,6 @@ # emulate runs on harvested machines export PYBUDA_FORCE_EMULATE_HARVESTED=1 unset PYBUDA_CI_DIR -PYBUDA_FORCE_THREADS=1 pybuda/test/benchmark/benchmark.py -m bert -c tiny -opt 4 -o perf.json --env "PYBUDA_EXP_APPROX=1 PYBUDA_DISABLE_DYNAMIC_DRAM=1 PYBUDA_FORCE_INTERMED_TO_OUTPUT_DF=1" --auto_transpose --parallel_tti device_images/ -PYBUDA_FORCE_THREADS=1 pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 --env "PYBUDA_RIBBON2=1 PYBUDA_LEGACY_UBLOCK_SHAPE=1 PYBUDA_MAXIMIZE_SPARSE_UBLOCK=1 PYBUDA_ENABLE_L1_ACCUMULATE=1 PYBUDA_EXTRA_L1_MARGIN=65536 PYBUDA_FUSED_OP_MULTIPLIER=20 PYBUDA_ENABLE_DEPTHWISE=1" -o perf.json --auto_transpose --parallel_tti device_images/ +PYBUDA_FORCE_THREADS=1 pybuda/test/benchmark/benchmark.py -m bert -c tiny -opt 4 -o perf.json --auto_transpose --parallel_tti device_images/ +PYBUDA_FORCE_THREADS=1 pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 -o perf.json --auto_transpose --parallel_tti device_images/ PYBUDA_FORCE_THREADS=1 pybuda/test/benchmark/benchmark.py -m flan_t5 -c base --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o perf.json --parallel_tti device_images/ diff --git a/pybuda/test/tti/test_tti_data_parallel.py b/pybuda/test/tti/test_tti_data_parallel.py index a03e3612..41a8b5e3 100755 --- a/pybuda/test/tti/test_tti_data_parallel.py +++ b/pybuda/test/tti/test_tti_data_parallel.py @@ -2,15 +2,17 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 +from typing import Optional, List 
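
# Hypothetical load with the new plural override (paths are placeholders and
# a compiled TTI plus silicon device are required): a one-element list
# overrides a single-chip image, while the two-element n300 data-parallel
# form additionally requires PYBUDA_N300_DATA_PARALLEL=1.
import pybuda

device = pybuda.TTDevice.load_image(img_path="model.tti", device_id_overrides=[2])
# n300 data parallel:
# device = pybuda.TTDevice.load_image(img_path="model.tti", device_id_overrides=[0, 1])
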
+import shutil import pybuda import pybuda.backend import torch import os -import pybuda import inspect from pybuda.pybudaglobal import pybuda_reset import numpy as np from pybuda.tools.tti_data_parallel import ( + split_tensor_batch, run_tti_data_parallel, RunMode, RunResult, @@ -67,14 +69,55 @@ def get_generative_params(other): return compile_inputs, num_tokens_to_generate, first_current_index, pad_token_id, write_index -if __name__ == "__main__": +def compile_and_save_tti( + module, + arch: pybuda.BackendDevice, + tti_output_path: str, + sample_inputs, + chip_ids: Optional[List[int]] = None, + num_chips: Optional[int] = None, +): + tt0 = pybuda.TTDevice( + "tt0", + module=module, + chip_ids=chip_ids, + num_chips=num_chips, + arch=arch + ) + tt0.compile_to_image( + img_path=tti_output_path, + training=False, + sample_inputs=sample_inputs, + ) + pybuda_reset() + +def get_model_config(base_kwargs, model, config): + models = get_models() + kwargs = base_kwargs.copy() + func = models[model]["func"] + available_parameters = inspect.signature(func).parameters + for p in available_parameters: + if p == "config": + kwargs["config"] = config + elif p == "force_num_layers": + kwargs["force_num_layers"] = 0 + + return func(**kwargs) + + +def test_tti_mmio_dp_sanity(): + clean_env = os.environ.copy() device_list = pybuda.detect_available_devices() - if not device_list: - raise RuntimeError("No devices available") + assert device_list, "No devices available" + + mmio_device_ids = [[0]] + arch = device_list[0] + num_loops = 16 + total_microbatch_size = 128 base_kwargs = { "training": False, - "microbatch": 128, + "microbatch": total_microbatch_size, "data_type": 'Fp16_b', "math_fidelity": 'HiFi3', "arch": "wormhole_b0", @@ -86,49 +129,99 @@ def get_generative_params(other): "bert": "base", "mobilenet_v2": "224" } - - mmio_device_ids = list(range(len(device_list))) - arch = device_list[0] - - output_dir="device_images/" + + output_dir = "device_images_multi_mmio/" os.makedirs(output_dir, exist_ok=True) + for model, config in model_to_config.items(): + model_config = get_model_config(base_kwargs, model, config) + duts, inputs, targets, other = model_config + module = duts['tt'] + image_path = os.path.join(output_dir, f"{model}.tti") + compile_and_save_tti( + module=module, + arch=arch, + chip_ids=[0], + tti_output_path=image_path, + sample_inputs=inputs, + ) + run_result: RunResult = run_tti_data_parallel( + precompiled_tti_path=image_path, + run_mode=RunMode.FORWARD, + inputs=ForwardRunInputs(inputs=inputs), + arch=arch, + device_ids=mmio_device_ids, + num_loops=num_loops, + output_dir=output_dir, + sync_at_run_start=True + ) + outputs = run_result.outputs + cpu_outputs = [module.cpu_eval_forward(*inputs)] * num_loops + check_outputs(cpu_outputs, outputs) + + pybuda_reset() + os.environ = clean_env + + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + +# Sanity test that runs on a single card +def test_tti_n300_dp_sanity(): + clean_env = os.environ.copy() + device_list = pybuda.detect_available_devices() + assert device_list, "No devices available" + assert os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1" + device_ids = [[0, 1]] + arch = device_list[0] num_loops = 16 total_microbatch_size = 128 + + base_kwargs = { + "training": False, + "microbatch": total_microbatch_size, + "data_type": 'Fp16_b', + "math_fidelity": 'HiFi3', + "arch": "wormhole_b0", + "devtype": "silicon", + } - models = get_models() - os.environ["PYBUDA_FORCE_THREADS"] = "1" - clean_env = os.environ.copy() + 
model_to_config = { + "resnet": "resnet50", + "bert": "base" + } - for model, config in model_to_config.items(): - kwargs = base_kwargs.copy() - func = models[model]["func"] - available_parameters = inspect.signature(func).parameters - for p in available_parameters: - if p == "config": - kwargs["config"] = config - elif p == "force_num_layers": - kwargs["force_num_layers"] = 0 - - model_config = func(**kwargs) + output_dir="device_images_n300_dp/" + os.makedirs(output_dir, exist_ok=True) + for model, config in model_to_config.items(): + model_config = get_model_config(base_kwargs, model, config) duts, inputs, targets, other = model_config module = duts['tt'] - run_result: RunResult = run_tti_data_parallel( + image_path = os.path.join(output_dir, f"{model}.tti") + compile_and_save_tti( module=module, + arch=arch, + num_chips=1, + tti_output_path=image_path, + sample_inputs=inputs, + ) + run_result: RunResult = run_tti_data_parallel( + precompiled_tti_path=image_path, run_mode=RunMode.FORWARD, inputs=ForwardRunInputs(inputs=inputs), arch=arch, - device_ids=mmio_device_ids, + device_ids=device_ids, num_loops=num_loops, output_dir=output_dir, sync_at_run_start=True ) outputs = run_result.outputs cpu_outputs = [module.cpu_eval_forward(*inputs)] * num_loops - check_outputs(cpu_outputs, outputs) pybuda_reset() - os.environ = clean_env \ No newline at end of file + os.environ = clean_env + + if os.path.exists(output_dir): + shutil.rmtree(output_dir) \ No newline at end of file From 5e6280cf160d432218e8cf40b560a18f9464081b Mon Sep 17 00:00:00 2001 From: Ashok Kumar Kannan Date: Wed, 26 Jun 2024 16:01:10 +0000 Subject: [PATCH 007/116] Fix fuse parse error in DistilBert (cherry picked from commit 5c97fa92d0a4ff6961631f21568b664ad3cd0bb4) --- .../model_demos/high_prio/nlp/pytorch/test_distilbert.py | 8 ++++++++ third_party/tvm | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py index cfde9ff5..e2ece2cf 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py @@ -27,6 +27,8 @@ def test_distilbert_masked_lm_pytorch(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + + os.environ["PYBUDA_DISABLE_MASKED_FILL_V2"] = "1" # Load data sample sample_text = "The capital of France is [MASK]." @@ -63,6 +65,8 @@ def test_distilbert_question_answering_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_MASKED_FILL_V2"] = "1" + if test_device.arch == BackendDevice.Grayskull and test_device.devtype == pybuda.BackendType.Golden: os.environ["PYBUDA_EXTRA_L1_MARGIN"] = '169536' compiler_cfg.enable_auto_fusing = False @@ -113,6 +117,8 @@ def test_distilbert_sequence_classification_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_MASKED_FILL_V2"] = "1" + # Load data sample review = "the movie was great!" 
@@ -147,6 +153,8 @@ def test_distilbert_token_classification_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_DISABLE_MASKED_FILL_V2"] = "1" + # Load data sample sample_text = "HuggingFace is a company based in Paris and New York" diff --git a/third_party/tvm b/third_party/tvm index bcc63db9..6b61b3e8 160000 --- a/third_party/tvm +++ b/third_party/tvm @@ -1 +1 @@ -Subproject commit bcc63db9729665b2d779c2b373362d7d5dddfb4d +Subproject commit 6b61b3e805f94ab16393fe35edaa4e650089933c From b1a7f463ffbf63a4dfe9150c615540c9156cb2a9 Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Thu, 27 Jun 2024 10:49:25 +0000 Subject: [PATCH 008/116] Remove Models in Push Pipeline (cherry picked from commit 7a89ab42af1145d9c7027e509abeb2c0b4cb38fa) --- .../cnn/pytorch/tests_A/test_autoencoder.py | 147 ------------------ .../tvm/cnn/pytorch/tests_A/test_convnext.py | 112 ------------- .../tvm/cnn/pytorch/tests_A/test_dalle_vae.py | 71 --------- .../cnn/pytorch/tests_A/test_efficientnet.py | 127 --------------- .../tvm/cnn/pytorch/tests_A/test_googlenet.py | 84 ---------- .../tvm/cnn/pytorch/tests_A/test_gscnn.py | 83 ---------- .../tvm/cnn/pytorch/tests_A/test_hrnet.py | 112 ------------- .../tvm/cnn/pytorch/tests_A/test_midas.py | 45 ------ 8 files changed, 781 deletions(-) delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_midas.py diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py deleted file mode 100644 index 8ad70db9..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# AutoEncoder basic bring-up tests of tracing functionality -# -import pytest - -import torch -from torch import nn - - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -# SPDX-FileCopyrightText: Copyright (c) 2018 Udacity -# -# SPDX-License-Identifier: MIT -# https://github.com/udacity/deep-learning-v2-pytorch -class LinearAE(nn.Module): - def __init__(self): - super().__init__() - - # Encoder - self.encoder_lin1 = nn.Linear(784, 128) - self.encoder_lin2 = nn.Linear(128, 64) - self.encoder_lin3 = nn.Linear(64, 12) - self.encoder_lin4 = nn.Linear(12, 3) - - # Decoder - self.decoder_lin1 = nn.Linear(3, 12) - self.decoder_lin2 = nn.Linear(12, 64) - self.decoder_lin3 = nn.Linear(64, 128) - self.decoder_lin4 = nn.Linear(128, 784) - - self.act_fun = nn.ReLU() - - def forward(self, x): - # Encode - act = self.encoder_lin1(x) - act = self.act_fun(act) - act = self.encoder_lin2(act) - act = self.act_fun(act) - act = self.encoder_lin3(act) - act = self.act_fun(act) - act = self.encoder_lin4(act) - - # Decode - act = self.decoder_lin1(act) - act = self.act_fun(act) - act 
= self.decoder_lin2(act) - act = self.act_fun(act) - act = self.decoder_lin3(act) - act = self.act_fun(act) - act = self.decoder_lin4(act) - - return act - - -class ConvAE(nn.Module): - def __init__(self): - super().__init__() - - # Encoder - self.encoder_conv2d_1 = nn.Conv2d(3, 16, 3, padding=1) - self.encoder_conv2d_2 = nn.Conv2d(16, 4, 3, padding=1) - self.encoder_max_pool2d = nn.MaxPool2d(2, 2) - - # Decoder - self.decoder_conv2d_1 = nn.ConvTranspose2d(4, 16, 2, stride=2) - self.decoder_conv2d_2 = nn.ConvTranspose2d(16, 3, 2, stride=2) - - self.act_fun = nn.ReLU() - - def forward(self, x): - # Encode - act = self.encoder_conv2d_1(x) - act = self.act_fun(act) - act = self.encoder_max_pool2d(act) - act = self.encoder_conv2d_2(act) - act = self.act_fun(act) - act = self.encoder_max_pool2d(act) - - # Decode - act = self.decoder_conv2d_1(act) - act = self.act_fun(act) - act = self.decoder_conv2d_2(act) - - return act - - -def test_linear_autoencoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - framework_model = LinearAE() - module = PyTorchModule( - "autoencoder", - framework_model, - ) - - input_shape = (1, 784) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_conv_autoencoder(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - framework_model = ConvAE() - module = PyTorchModule( - "pt_conv_autoencoder", - framework_model, - ) - - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py deleted file mode 100644 index 3b8dd25a..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# ConvNeXt basic bring-up tests of tracing functionality -# -import pytest - -from transformers import ConvNextModel - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model -import pybuda - - -def test_convnext_tiny(test_kind, test_device): - pytest.skip() - # Always run with recompute in post-commit CI. 
Nightly tests both - if test_kind == TestKind.TRAINING: - pytest.skip() - - # import os - # os.environ["PYBUDA_PRINT_GRAPH_VIZ_FORMAT_AT"] = "PRE_PLACER" - # os.environ["PYBUDA_PRINT_GRAPH_VIZ_FORMAT_DIR"] = "forward_only" - # os.environ["PYBUDA_PRINT_GRAPH_VIZ_FORMAT_DIR"] = "backward_only" - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # tenstorrent/pybuda#365 - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.retain_tvm_python_files = True - - framework_model = download_model(ConvNextModel.from_pretrained, "facebook/convnext-tiny-224", torchscript=True) - module = PyTorchModule("pt_convnext_tiny", framework_model) - - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.97, - ), - ) - - -def test_convnext_embeddings(test_kind, test_device): - pytest.skip() # Already testing with full model - - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - framework_model = download_model(ConvNextModel.from_pretrained, "facebook/convnext-tiny-224", torchscript=True) - framework_model = framework_model.embeddings - module = PyTorchModule("convnext_embeddings", framework_model) - - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_convnext_layer(test_kind, test_device): - pytest.skip() # Already testing with full model - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - framework_model = download_model(ConvNextModel.from_pretrained, "facebook/convnext-tiny-224", torchscript=True) - framework_model = framework_model.encoder.stages[0].layers[0] - module = PyTorchModule("convnext_layer", framework_model) - - input_shape = (1, 96, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py deleted file mode 100644 index 63ba98e7..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest -import os - -import torch -import torch.nn as nn -from test.tvm.cnn.pytorch.dall_e_vae import Encoder, Decoder - - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import pybuda - - -def test_tvm_dalle_Encoder(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - compiler_cfg = _get_global_compiler_config() - 
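Most of these deleted bring-up tests share one skeleton; a condensed sketch of that idiom, assuming only the pybuda APIs the files themselves import (per-model details such as balancer policy, compile depth and input shape vary as shown in the hunks):

import torch
from pybuda import PyTorchModule, VerifyConfig
from pybuda.config import _get_global_compiler_config
from pybuda.verify.backend import verify_module

def run_bringup_test(framework_model: torch.nn.Module, name: str,
                     input_shape, test_device, test_kind, pcc: float = 0.99):
    # Global compiler knobs are set before wrapping the model.
    compiler_cfg = _get_global_compiler_config()
    compiler_cfg.balancer_policy = "CNN"  # or "Ribbon", depending on the model
    module = PyTorchModule(name, framework_model)
    # verify_module traces, compiles and compares against the framework output.
    verify_module(
        module,
        (input_shape,),
        verify_cfg=VerifyConfig(
            arch=test_device.arch,
            devtype=test_device.devtype,
            test_kind=test_kind,
            pcc=pcc,
        ),
    )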
compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 3, 224, 224) - - model = Encoder() - mod = PyTorchModule("DALLE_vae_encoder", model.blocks[:3]) # Reduce compile time - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_tvm_dalle_Decoder(test_kind, test_device): - if test_device.arch == pybuda.BackendDevice.Grayskull: - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 8192, 32, 32) - - model = Decoder() - mod = PyTorchModule("DALLE_vae_encoder", model.blocks[:3]) # Reduce compile time - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py deleted file mode 100644 index 254163b7..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, - BackendType, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - -def test_efficientnet_layer(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - compiler_cfg.balancer_policy = "CNN" - - model = download_model(torch.hub.load, - "NVIDIA/DeepLearningExamples:torchhub", - "nvidia_efficientnet_b0", - pretrained=True, - ) - module = PyTorchModule("efficientnet_b0_layer_torch", model.layers[0]) - - input_shape = (1, 32, 112, 112) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_efficientnet_stem(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - #if test_kind.is_training(): - # pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = download_model(torch.hub.load, - "NVIDIA/DeepLearningExamples:torchhub", - "nvidia_efficientnet_b0", - pretrained=True, - ) - module = PyTorchModule("efficientnet_b0_stem_torch", model.stem) - - pcc = 0.98 if test_device.devtype == BackendType.Silicon and test_kind.is_training() else 0.99 - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - ), - ) - -def test_efficientnet_b0(test_kind, test_device): - pytest.skip() - #if test_kind.is_training(): - # pytest.skip() # Backward is currently unsupported - - import timm - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = timm.create_model('efficientnet_b0', pretrained=True) - module = PyTorchModule("efficientnet_b0", model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - -def test_efficientnet_b4(test_kind, test_device): - pytest.skip() - #if test_kind.is_training(): - # pytest.skip() # Backward is currently unsupported - - import timm - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = timm.create_model('efficientnet_b4', pretrained=True) - module = PyTorchModule("efficientnet_b0", model) - - input_shape = (1, 3, 320, 320) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py deleted file mode 100644 index a1d35688..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import pytest - -import torch -from torchvision import transforms, models - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model -import pybuda - - -def test_tvm_googlenet(test_kind, test_device): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: - pytest.skip("Skip for Wormhole_B0") - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit 
CI. Nightly tests both - pytest.skip() - - os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "30000" - compiler_cfg = _get_global_compiler_config() - - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - pytorch_model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "googlenet", pretrained=True - ) - module = PyTorchModule("googlenet", pytorch_model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.98, - ), - ) - - -def test_googlenet_torchvision(test_kind, test_device): - - pytest.skip("Needs padding") - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - # Adding required environment variables as per https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/43 - import os - # This will allow the test to pass but we should use conv padding to fix the issue instead - # os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "30000" - # unknown padding to add - compiler_cfg = _get_global_compiler_config() - - compiler_cfg.balancer_policy = "CNN" - - model = download_model(models.googlenet, pretrained=True) - module = PyTorchModule("googlenet_pt", model) - - input_shape = (1, 3, 256, 256) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py index ba1c77cb..5c8de7dd 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py @@ -16,31 +16,7 @@ from test.utils import download_model -def test_gscnn_pytorch(test_kind, test_device): - pytest.skip() # Takes too long to compile/run - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = download_model(get_model, - network="gscnn.gscnn.GSCNN", - num_classes=30, - criterion=None, # Only needed for training - trunk="resnet18", - ) - - module = PyTorchModule("gscnn_torch", model) - input_shape = (1, 3, 1024, 2048) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) from test.tvm.cnn.pytorch.gscnn.wider_resnet import wider_resnet38_a2 @@ -75,62 +51,3 @@ def test_wider_resnet_torch(test_kind, test_device): test_kind=test_kind, ), ) - - -import test.tvm.cnn.pytorch.gscnn.gated_spatial_conv as gsc - -def test_gated_spatial_conv_torch(test_kind, test_device): - pytest.skip() #TODO: Debug why this runs out of memory - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
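These tests funnel every pretrained-weight fetch through test.utils.download_model, passing the loader callable plus its arguments. The helper itself is not shown in this series, so the following is only a hedged sketch of what such a wrapper plausibly does (retry flaky hub downloads), not the real implementation:

import time

def download_model(loader, *args, max_retries: int = 3, **kwargs):
    # Hypothetical retry wrapper: call the loader, backing off on failure.
    for attempt in range(max_retries):
        try:
            return loader(*args, **kwargs)
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)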
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - model = gsc.GatedSpatialConv2d(32, 32) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS # tenstorrent/pybuda#185 - - module = PyTorchModule("gated_spatial_conv_torch", model) - - input_shape0 = (1, 32, 1024, 2048) - input_shape1 = (1, 1, 1024, 2048) - verify_module( - module, - (input_shape0, input_shape1), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -from test.tvm.cnn.pytorch.gscnn.gscnn import _AtrousSpatialPyramidPoolingModule -# Need to support non-square pooling + convolution (kernel size and stride) -def test_spatial_pyramid_pooling_torch(test_kind, test_device): - pytest.skip() - - model = _AtrousSpatialPyramidPoolingModule(4096, 256, output_stride=8) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - module = PyTorchModule("spatial_pyramid_pooling_torch", model) - - input_shape0 = (1, 4096, 128, 256) - input_shape1 = (1, 1, 1024, 2048) - verify_module( - module, - (input_shape0, input_shape1), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py deleted file mode 100644 index 754826d1..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# High-Resolution Network (HRNet) basic bring-up tests of tracing functionality -# -import pytest - -import timm - -from pybuda import ( - PyTorchModule, - VerifyConfig, - BackendType, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - - -def test_hrnet_full_model(test_kind, test_device): - if test_device.devtype != BackendType.Silicon: - pytest.skip() # Testing full model on nightly CI - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - framework_model = timm.create_model("hrnet_w18") - module = PyTorchModule( - "hrnet", - framework_model, - ) - - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_hrnet_basic_block(test_kind, test_device): - if ( - test_kind.is_training() - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - framework_model = timm.create_model("hrnet_w18") - framework_model = framework_model.stage2[0].branches[0][0] - module = PyTorchModule( - "hrnet_basic_block", - framework_model, - ) - - input_shape = (1, 18, 9, 9) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_hrnet_fuse_layer(test_kind, test_device): - if ( - test_kind.is_training() - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - framework_model = timm.create_model("hrnet_w18") - framework_model = framework_model.stage2[0].fuse_layers[0][1] - module = PyTorchModule( - "hrnet_basic_block", - framework_model, - ) - - input_shape = (1, 36, 9, 9) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_midas.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_midas.py deleted file mode 100644 index c6ea1468..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_midas.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_MIDAS_pytorch( - test_kind, - test_device, -): - pytest.skip() # Takes too long post commit - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.POST_INITIAL_GRAPH_PASS - - model = download_model(torch.hub.load, "intel-isl/MiDaS", "MiDaS_small") - module = PyTorchModule("MIDAS_torch", model) - - input_shape = (1, 3, 384, 384) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) From e9061d2cefd1496f1729b99612f1a6d128d65aae Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 27 Jun 2024 13:56:35 +0000 Subject: [PATCH 009/116] Fix pybuda pipeline failures (26/06/2024) (cherry picked from commit 7513f6670c5bd73632bbc4beb73909643ec0feb4) --- pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py | 7 +++---- .../test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py | 3 --- .../model_demos/high_prio/cnn/pytorch/test_efficientnet.py | 3 +++ .../model_demos/high_prio/cnn/pytorch/test_xception.py | 6 ++++++ pybuda/test/model_demos/models/xception.py | 1 + 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py index b25f5bc3..54f5b412 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_clip.py @@ -112,6 +112,7 @@ def test_clip_pytorch(test_device): # to check this out in more details: # tenstorrent/pybuda#1828 os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_DISABLE_MASKED_FILL_V2"] = "1" # Load processor and model from HuggingFace model_ckpt = "openai/clip-vit-base-patch32" @@ -148,10 +149,8 @@ def test_clip_pytorch(test_device): prob_cat = float(f"{probs[0].tolist()[0]*100:.1f}") prob_dog = float(f"{probs[0].tolist()[1]*100:.1f}") - # Pcc drop due to Masked_fill op kernel - # Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712 - # assert 99.3 <= prob_cat - # assert 0.7 >= prob_dog + assert 99.3 <= prob_cat + assert 0.7 >= prob_dog processed_output = list(zip(text, probs[0].tolist())) print("RESULTS") diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py index 374ddebf..014ea697 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -22,9 +22,6 @@ from semseg import DualResNet, BasicBlock_seg -torch.multiprocessing.set_sharing_strategy("file_system") - - variants = ["ddrnet23s", "ddrnet23", "ddrnet39"] diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index 23ad86cb..4fdc2ea2 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -61,6 +61,9 @@ def test_efficientnet_timm(variant, test_device): os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16}" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" # Load model framework_model = download_model(timm.create_model, variant, pretrained=True) diff --git 
a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py index 546dc7d2..e3e6353b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py @@ -29,6 +29,11 @@ def test_xception_timm(variant, test_device): variant, ) + if test_device.arch == BackendDevice.Grayskull and variant == "xception": + pcc_value = 0.95 + else : + pcc_value = 0.99 + verify_module( model, input_shapes=[(inputs[0].shape,)], @@ -39,5 +44,6 @@ def test_xception_timm(variant, test_device): devmode=test_device.devmode, test_kind=TestKind.INFERENCE, num_chips=1, + pcc = pcc_value, ), ) diff --git a/pybuda/test/model_demos/models/xception.py b/pybuda/test/model_demos/models/xception.py index b3e889ad..96092df5 100644 --- a/pybuda/test/model_demos/models/xception.py +++ b/pybuda/test/model_demos/models/xception.py @@ -21,6 +21,7 @@ def generate_model_xception_imgcls_timm(test_device, variant): if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.balancer_policy = "CNN" elif test_device.arch == BackendDevice.Grayskull: + compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.amp_level = 1 else: compiler_cfg.balancer_policy = "Ribbon" From 1e2b5653da801e5f8f3d4f8634f489df2f3df401 Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Fri, 28 Jun 2024 08:13:40 +0000 Subject: [PATCH 010/116] Removed few models in tests_A (cherry picked from commit d8ab6087d3fbccd76a1415a5b5388a77b25e74cf) --- .../tvm/cnn/pytorch/tests_A/test_alexnet.py | 52 ----- .../tvm/cnn/pytorch/tests_A/test_deeplab.py | 44 ----- .../tvm/cnn/pytorch/tests_A/test_gscnn.py | 53 ----- .../tvm/cnn/pytorch/tests_A/test_inception.py | 183 ------------------ .../tvm/cnn/pytorch/tests_A/test_mnasnet.py | 45 ----- 5 files changed, 377 deletions(-) delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_deeplab.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_inception.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_A/test_mnasnet.py diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py deleted file mode 100644 index 66d93b9b..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Alexnet basic bring-up tests of tracing functionality -# -import time -import numpy as np -import pytest - -import math -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model -import pybuda - -def test_tvm_alexnet(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
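The xception fix above is an instance of a recurring pattern in these demos: choose the PCC threshold from the device architecture and model variant. A sketch of that selection, where the thresholds are the ones from the patch but the helper itself is hypothetical:

import pybuda

def xception_pcc(test_device, variant: str) -> float:
    # 0.95 on Grayskull for the base "xception" variant, 0.99 otherwise,
    # mirroring the branch added in test_xception_timm above.
    if test_device.arch == pybuda.BackendDevice.Grayskull and variant == "xception":
        return 0.95
    return 0.99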
Nightly tests both - pytest.skip() - - if (test_kind == TestKind.TRAINING_RECOMPUTE): - pytest.skip() # tenstorrent/pybuda#215 - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - if test_kind.is_training(): - compiler_cfg.compile_depth = ( - CompileDepth.GENERATE_INITIAL_GRAPH - ) # Pooling backward is unimplemented - - pytorch_model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "alexnet", pretrained=True) - module = PyTorchModule("pt_alexnet", pytorch_model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_deeplab.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_deeplab.py deleted file mode 100644 index dc6a31d1..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_deeplab.py +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_deeplabv3_pytorch(test_kind, test_device): - pytest.skip() # Running full models on nightly - - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "deeplabv3_resnet50", pretrained=True - ) - module = PyTorchModule("deeplabv3_resnet50", model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py deleted file mode 100644 index 5c8de7dd..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_gscnn.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from test.tvm.cnn.pytorch.gscnn import get_model -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - - - -from test.tvm.cnn.pytorch.gscnn.wider_resnet import wider_resnet38_a2 - -def test_wider_resnet_torch(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - model = wider_resnet38_a2(classes=1000, dilation=True) - submodel = torch.nn.Sequential( - model.mod1, - model.pool2, - ) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH # Needs neg maxpool support tenstorrent/pybuda#188 - - module = PyTorchModule("wider_resnet_torch", submodel) - - input_shape = (1, 3, 1024, 2048) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_inception.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_inception.py deleted file mode 100644 index 91a7b830..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_inception.py +++ /dev/null @@ -1,183 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - - -def test_inceptionv3_a_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True).Mixed_5b - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 192, 35, 35) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_inceptionv3_b_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True).Mixed_6a - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_conv_prestride = False # tenstorrent/pybuda#925 - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 288, 35, 35) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_inceptionv3_c_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True).Mixed_6b - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
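The deleted inception tests in this file derive each PyBuda module's name from the wrapped submodule's class; a compact sketch of that naming trick, using the exact expression from the deleted code:

def module_name_of(model) -> str:
    # e.g. "<class 'torchvision.models.inception.InceptionA'>" -> "InceptionA"
    return str(model.__class__).split(".")[-1].split("'")[0]

type(model).__name__ returns the same string more directly; the split-based form is simply what these tests used.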
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 768, 17, 17) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_inceptionv3_d_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True).Mixed_7a - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 768, 17, 17) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_inceptionv3_e_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True).Mixed_7b - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 1280, 8, 8) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - -def test_inceptionv3_full_pytorch(test_kind, test_device): - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "inception_v3", pretrained=True) - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - module_name = str(model.Mixed_7b.__class__).split(".")[-1].split("'")[0] - module = PyTorchModule(module_name, model) - - input_shape = (1, 3, 128, 128) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_mnasnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_mnasnet.py deleted file mode 100644 index b508b301..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_mnasnet.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch -from torchvision.models.mnasnet import MNASNet - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - - -def test_mnasnet(test_kind, test_device): - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - if test_kind.is_training(): - pytest.skip() - else: - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) # Unsupported HW ops - - framework_model = MNASNet(1.0) - module = PyTorchModule( - "mnasnet", - framework_model, - ) - - input_shape = (1, 3, 64, 64) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) From bca7bcefdbd5e4744f5511527e70c5061c2e90d4 Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Tue, 2 Jul 2024 07:34:44 +0000 Subject: [PATCH 011/116] Fix dla and efficientnet model ci failures (cherry picked from commit b9d0fecd8e40bb97762bbb860e8501882f28edca) --- pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py | 4 +++- .../test/model_demos/high_prio/cnn/pytorch/test_dla.py | 2 ++ .../high_prio/cnn/pytorch/test_efficientnet.py | 10 +++------- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py index be50789a..8786465b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py @@ -71,8 +71,10 @@ def test_dla_onnx(test_device, variant): elif variant == "dla169": pcc = 0.96 elif test_device.arch == BackendDevice.Grayskull: - if variant == "dla46_c": + if variant in ["dla46_c", "dla102x2", "dla169"]: pcc = 0.97 + if variant in ["dla60", "dla102x"]: + pcc = 0.98 if variant == "dla102x2": os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py index ada995f0..939b9cfe 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py @@ -58,6 +58,8 @@ def test_dla_pytorch(variant, test_device): os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" if func.__name__ == "dla46_c": pcc = 0.97 + if func.__name__ in ["dla60", "dla102x", "dla102x2", "dla169"]: + pcc = 0.98 # Load data sample 
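A sketch of how such a data sample is typically fetched and batched in these demos, assuming requests/PIL/torchvision are available; the actual preprocessing in test_dla.py sits outside this hunk, so treat this as illustrative only:

from io import BytesIO

import requests
from PIL import Image
from torchvision import transforms

def load_sample(url: str):
    # Fetch the image and normalize it into a 1x3xHxW float tensor.
    image = Image.open(BytesIO(requests.get(url, timeout=30).content)).convert("RGB")
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    return preprocess(image).unsqueeze(0)  # add the batch dimension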
url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index 4fdc2ea2..37f6bb35 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -43,6 +43,7 @@ def test_efficientnet_timm(variant, test_device): compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.enable_auto_fusing = False + pcc_value = 0.94 if variant == "efficientnet_b0": # Solves issue for bigger conv layers in the middle of the graph if test_device.arch == BackendDevice.Wormhole_B0: @@ -55,15 +56,10 @@ def test_efficientnet_timm(variant, test_device): elif variant == "efficientnet_b4": if test_device.arch == BackendDevice.Wormhole_B0: + pcc_value = 0.92 compiler_cfg.amp_level = 1 compiler_cfg.default_df_override=pybuda.DataFormat.Float16_b os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16}" - os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" - os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" - os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" - os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" # Load model framework_model = download_model(timm.create_model, variant, pretrained=True) @@ -102,7 +98,7 @@ def test_efficientnet_timm(variant, test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - pcc=0.94, + pcc=pcc_value, ), ) From 1378af1d4dd500b07d3617e6e4300c39c96130b1 Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Tue, 2 Jul 2024 12:33:18 +0000 Subject: [PATCH 012/116] Fix ddrnet core dump issue (cherry picked from commit af6bd27ebb955e31239ddae4004d245e82ccd701) --- pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py index 014ea697..4d0470ef 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -117,6 +117,8 @@ def test_ddrnet_semantic_segmentation_pytorch(variant, test_device): ): compiler_cfg.enable_auto_fusing = False compiler_cfg.amp_level = 2 + os.environ["PYBUDA_BALANCER_USE_DRAM_BW_ESTIMATES"] = "1" + os.environ["PYBUDA_BALANCER_USE_NOC_BW_ESTIMATES"] = "1" # prepare model if variant == "ddrnet23s_cityscapes": From c62d799467689c7a5f73f2ffbf6441fc132f8afb Mon Sep 17 00:00:00 2001 From: kkannan Date: Tue, 2 Jul 2024 11:58:17 +0000 Subject: [PATCH 013/116] Add tests for yolox(pytorch) model - GS(e300 & e150) (cherry picked from commit e999f4183ac877bc35b79225b6093bfd482d8407) --- .../high_prio/cnn/pytorch/test_yolo_x.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py index 9e802e52..8c615809 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py @@ -112,6 +112,77 @@ def test_yolox_pytorch(variant, test_device): compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", 
"t_stream_shape", (1, 4)) compiler_cfg.place_on_new_epoch("concatenate_2264.dc.sparse_matmul.11.lc2") + elif test_device.arch == BackendDevice.Grayskull: + + if variant in ["yolox_nano", "yolox_tiny"]: + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 1)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "81920" + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.14.lc2", "t_stream_shape", (1, 13)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.14.lc2", "t_stream_shape", (1, 13)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.14.lc2", "t_stream_shape", (1, 13)) + + elif variant == "yolox_s": + compiler_cfg.balancer_op_override("concatenate_1163.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.place_on_new_epoch("concatenate_1163.dc.sparse_matmul.11.lc2") + + elif variant in ["yolox_m", "yolox_l", "yolox_darknet", "yolox_x"]: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + if variant == "yolox_m": + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.place_on_new_epoch("concatenate_1530.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_l": + compiler_cfg.place_on_new_epoch("conv2d_1410.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1644.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + 
compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_1897.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_darknet": + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.3.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.5.dc.matmul.11") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "12288" + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.5.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2") + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1147.dc.matmul.11") + compiler_cfg.balancer_op_override("concatenate_1242.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + + elif variant == "yolox_x": + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.5.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.3.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1699.dc.matmul.8") + compiler_cfg.place_on_new_epoch("conv2d_1732.dc.matmul.8") + compiler_cfg.place_on_new_epoch("conv2d_1981.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1736.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_2264.dc.sparse_matmul.11.lc2") + # prepare model weight_name = f"{variant}.pth" url = f"https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/{weight_name}" @@ -152,6 +223,14 @@ def test_yolox_pytorch(variant, test_device): if test_device.arch == BackendDevice.Wormhole_B0 and variant == "yolox_nano": pcc_value = 0.96 + elif test_device.arch == BackendDevice.Grayskull: + if variant in ["yolox_nano", "yolox_s", "yolox_l", "yolox_x"]: + pcc_value = 0.93 + elif variant in ["yolox_m,yolox_darknet"]: + pcc_value = 0.92 + elif variant == "yolox_tiny": + pcc_value = 0.98 + # Inference verify_module( tt_model, From 495a82232b57e88625c3a3088f16ef54cd511eb5 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Mon, 24 Jun 2024 15:37:18 +0000 Subject: [PATCH 014/116] Save failing tests separately Issue #2755 (cherry picked from commit 928933eade3faec99f0742f34f3e64266b7b40fa) --- pybuda/test/random/rgg/base.py | 13 +++++++++++-- pybuda/test/random/rgg/config.py | 1 + pybuda/test/random/rgg/datatypes.py | 1 + .../test/random/rgg/pybuda/generated_model.jinja2 | 2 +- .../test/random/rgg/pytorch/generated_model.jinja2 | 2 +- 5 files changed, 15 insertions(+), 4 deletions(-) diff --git 
a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py index e4f8d6e9..87d81f8b 100644 --- a/pybuda/test/random/rgg/base.py +++ b/pybuda/test/random/rgg/base.py @@ -203,8 +203,11 @@ def verify(self, model: PyBudaModule) -> None: verify_module(model, input_shapes, VerifyConfig(devtype=parameters.test_device.devtype, arch=parameters.test_device.arch)) - def save_test(self, test_code_str: str): + def save_test(self, test_code_str: str, failing_test: bool = False): test_dir = self.test_context.randomizer_config.test_dir + if failing_test: + test_dir = f"{test_dir}/failing_tests" + test_code_str = test_code_str.replace("# @pytest.mark.xfail", "@pytest.mark.xfail") test_code_file_name = f"{test_dir}/test_gen_model_{StrUtils.test_id(self.test_context)}.py" if not os.path.exists(test_dir): @@ -251,7 +254,7 @@ def run(self, graph_builder: GraphBuilder): if randomizer_config.save_tests: # saving test source code to file for debugging purposes - self.save_test(test_code_str) + self.save_test(test_code_str, failing_test=False) logger.debug(f"Graph built in: {graph_duration.get_duration():.4f} seconds") @@ -261,8 +264,14 @@ def run(self, graph_builder: GraphBuilder): # perform model validation try: verify_duration = Timer() + verify_successful = False self.verify(model) + verify_successful = True finally: + if not verify_successful: + if randomizer_config.save_failing_tests: + # saving error test source code to file for debugging purposes + self.save_test(test_code_str, failing_test=True) logger.debug(f"Test verified in: {verify_duration.get_duration():.4f} seconds") else: logger.info("Skipping test run") diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py index 58f11d8e..e4453cf6 100644 --- a/pybuda/test/random/rgg/config.py +++ b/pybuda/test/random/rgg/config.py @@ -17,6 +17,7 @@ def get_randomizer_config_default(): print_code = True, run_test = True, save_tests = True, + save_failing_tests = True, # build_model_from_code = False, debug_shapes = False, verify_shapes = False, diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index 8a2c6394..d58be976 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -81,6 +81,7 @@ class RandomizerConfig: run_test: bool = True test_dir:str = "pybuda/test/random_tests" save_tests: bool = False + save_failing_tests: bool = False # build_model_from_code: bool = False # TODO remove obsoleted debug_shapes: bool = False, verify_shapes: bool = False, diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 index 5b339c4f..bb8be3d1 100644 --- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 @@ -35,7 +35,7 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule): return v {% if test_format %} -# @pytest.mark.skip(reason="Skip this test for now.") +# @pytest.mark.xfail(reason="The model triggers a bug.") def test_gen_model_{{ test_index }}_{{ random_seed }}(test_device): input_shapes = [ diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 index 7e556411..0e616a6a 100644 --- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 @@ -33,7 +33,7 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module): return v {% if test_format %} -# @pytest.mark.skip(reason="Skip 
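The save_test change above pairs with these template edits: generated sources carry a commented-out xfail marker, and saving a failing test simply uncomments it. A one-line sketch of that flip, using the exact strings from the patch:

def enable_xfail(test_code_str: str) -> str:
    # Turn the dormant marker emitted by the jinja2 templates into a live one.
    return test_code_str.replace("# @pytest.mark.xfail", "@pytest.mark.xfail")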
this test for now.") +# @pytest.mark.xfail(reason="The model triggers a bug.") def test_gen_model_{{ test_index }}_{{ random_seed }}(test_device): input_shapes = [ From e0284fe09972ac49c72ddc4f5fa10c5a6af665a5 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 2 Jul 2024 07:54:29 +0000 Subject: [PATCH 015/116] Move random seeds to test_context Issue #2755 (cherry picked from commit 382c40a28d1e61108a0ff43e3e21fa2a995bb1ff) --- pybuda/test/random/rgg/algorithms.py | 38 +++++++++++++++------------- pybuda/test/random/rgg/base.py | 9 +++++++ pybuda/test/random/rgg/datatypes.py | 8 ++++++ 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 9e7880b2..473c805a 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -4,7 +4,6 @@ # Implementation of randomization algrorithms -import random from typing import List from loguru import logger @@ -25,7 +24,7 @@ class GraphNodeSetup: always_unique_variables = False @classmethod - def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random): + def init_nodes(cls, test_context: RandomizerTestContext): """ Initializes the nodes of a graph. @@ -36,7 +35,7 @@ def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random): 4. Generates random settings for operator parameters. Args: - graph (RandomizerGraph): The graph nodes to initialize. + test_context (RandomizerTestContext): The test context. Raises: Exception: If the number of inputs for a node does not match the configured input number. @@ -45,7 +44,11 @@ def init_nodes(cls, graph: RandomizerGraph, rng_params: random.Random): Returns: None """ - nodes = graph.nodes + graph = test_context.graph + nodes = test_context.graph.nodes + + rng_shape = test_context.rng_shape + rng_params = test_context.rng_params # Setting node.index op_index_cnt = 0 @@ -115,9 +118,12 @@ def validate_graph(cls, graph: RandomizerGraph): raise Exception(f"Step operator is wrong type {node.node_info()} expected RandomizerOperator got {type(node.operator)}") @classmethod - def prepare_graph(cls, graph: RandomizerGraph, rng_params: random.Random): + def prepare_graph(cls, test_context: RandomizerTestContext): + + graph = test_context.graph + logger.trace("Initializing nodes") - cls.init_nodes(graph, rng_params) + cls.init_nodes(test_context) logger.trace("Nodes initialized") logger.trace("Validating graph") @@ -165,18 +171,12 @@ def _init_default_constructor_params(self, node: RandomizerNode): # Input shapes for each node are calculated based on output shape of the node def build_graph(self, test_context: RandomizerTestContext): '''Implementation of the random graph building algorithm''' - parameters = test_context.parameters + graph = test_context.graph nodes = graph.nodes - # Initialize random number generators for graph building - rng_graph = random.Random(parameters.random_seed) - - # Initialize random number generators for shape generation - rng_shape = random.Random(test_context.parameters.random_seed) - - # Initialize random number generators for parameters - rng_params = random.Random(test_context.parameters.random_seed) + rng_graph = test_context.rng_graph + rng_shape = test_context.rng_shape fork_join_counter = 0 fork_join_max = test_context.randomizer_config.num_fork_joins_max @@ -184,6 +184,8 @@ def build_graph(self, test_context: RandomizerTestContext): # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max num_of_nodes = 
rng_graph.randint(self.randomizer_config.num_of_nodes_min, self.randomizer_config.num_of_nodes_max) for node_index in range(num_of_nodes, 0, -1): + first_node = node_index == num_of_nodes + # Choose operator randomly based on rng op1 = self._get_random_operator(rng_graph) @@ -191,7 +193,7 @@ def build_graph(self, test_context: RandomizerTestContext): open_nodes = NodeUtils.get_open_nodes(nodes) # Select output shape for the new node - if len(open_nodes) == 0: + if first_node: # For the first node set output shape as random shape output_shape = RandomUtils.random_shape_from_config(self.randomizer_config, rng_shape) else: @@ -209,7 +211,7 @@ def build_graph(self, test_context: RandomizerTestContext): # Closing multiple nodes will construct fork joins random_nodes: List[RandomizerNode] - if len(open_nodes) > 0: + if not first_node: # There must be at least one node to close subset_count_min = max(1, len(open_nodes) // 2) subset_count_max = len(open_nodes) @@ -254,5 +256,5 @@ def build_graph(self, test_context: RandomizerTestContext): logger.trace(f"Graph built with {len(nodes)} nodes") logger.trace("Preparing graph") - GraphNodeSetup.prepare_graph(graph, rng_params) + GraphNodeSetup.prepare_graph(test_context) logger.trace("Graph prepared") diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py index 87d81f8b..8dbe204b 100644 --- a/pybuda/test/random/rgg/base.py +++ b/pybuda/test/random/rgg/base.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from jinja2 import Environment, FileSystemLoader import os +import random from pybuda import PyBudaModule from pybuda.verify import verify_module, VerifyConfig @@ -177,6 +178,14 @@ def generate_code(self) -> str: def build_graph(self, graph_builder: GraphBuilder) -> None: self.test_context.graph = RandomizerGraph() + + # Initialize random number generators for graph building + self.test_context.rng_graph = random.Random(self.test_context.parameters.random_seed) + # Initialize random number generators for shape generation + self.test_context.rng_shape = random.Random(self.test_context.parameters.random_seed) + # Initialize random number generators for parameters + self.test_context.rng_params = random.Random(self.test_context.parameters.random_seed) + graph_builder.build_graph(self.test_context) def build_model(self) -> PyBudaModule: diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index d58be976..90a52d72 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional, Tuple from dataclasses import dataclass, field +import random import torch from pybuda.op_repo import OperatorDefinition @@ -104,3 +105,10 @@ class RandomizerTestContext: # graph_builder: GraphBuilder graph: Optional[RandomizerGraph] # graph will be constructed later during test processing test_name: str = "Default" + + # random number generators for graph building + rng_graph: Optional[random.Random] = None + # random number generators for shape generation + rng_shape: Optional[random.Random] = None + # random number generators for parameters + rng_params: Optional[random.Random] = None From 22d5485d0a71a2f131d89492cf85bad3f4105977 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 2 Jul 2024 14:34:24 +0000 Subject: [PATCH 016/116] Random input order Issue #2755 (cherry picked from commit eb265b5e6607001e9a2612d6b17948ac8a3c3ac9) --- pybuda/test/README.debug.md | 1 + pybuda/test/random/rgg/algorithms.py | 81 ++++++++++++++++++++++------ 
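The seeding scheme introduced in base.py above keeps graph building, shape generation and parameter generation reproducible from a single test seed; a condensed sketch mirroring the patch:

import random

def init_rngs(random_seed: int):
    # Three independent streams seeded identically: each advances at its own
    # pace, so adding a draw in one phase does not perturb the others.
    rng_graph = random.Random(random_seed)
    rng_shape = random.Random(random_seed)
    rng_params = random.Random(random_seed)
    return rng_graph, rng_shape, rng_params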
pybuda/test/random/rgg/config.py | 1 + pybuda/test/random/rgg/datatypes.py | 8 ++- pybuda/test/random/rgg/utils.py | 57 +++++++++++++++++--- pybuda/test/utils.py | 2 + 6 files changed, 126 insertions(+), 24 deletions(-) diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md index 3153dbdf..6753e5aa 100644 --- a/pybuda/test/README.debug.md +++ b/pybuda/test/README.debug.md @@ -13,3 +13,4 @@ * NUM\_OF\_NODES\_MIN: Minimal number of nodes to be generated by RGG. (default: 5) * NUM\_OF\_NODES\_MAX: Maximum number of nodes to be generated by RGG. (default: 10) * NUM\_OF\_FORK\_JOINS\_MAX: Maximum number of fork joins to be generated by random graph algorithm in RGG. (default: 50) + * SAME\_INPUTS\_PERCENT\_LIMIT: Percent limit of nodes which have same value on multiple inputs. (default: 10) diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 473c805a..52354f6a 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -14,6 +14,7 @@ from .base import RandomizerNode, GraphBuilder from .base import Framework from .utils import RandomUtils, StrUtils, NodeUtils +from .utils import RateLimitter class GraphNodeSetup: @@ -50,6 +51,8 @@ def init_nodes(cls, test_context: RandomizerTestContext): rng_shape = test_context.rng_shape rng_params = test_context.rng_params + same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) + # Setting node.index op_index_cnt = 0 for node in nodes: @@ -62,7 +65,7 @@ def init_nodes(cls, test_context: RandomizerTestContext): # setting default output variable name node.out_value = "v" for input_node in node.inputs: - if not NodeUtils.is_previous_node(node, input_node) or cls.always_unique_variables: + if (input_node is not None and not NodeUtils.is_previous_node(node, input_node)) or cls.always_unique_variables: # overriding default output variable name input_node.out_value = input_node.operator_name() logger.trace(f"Set out_value = {input_node.out_value}") @@ -74,15 +77,42 @@ def init_nodes(cls, test_context: RandomizerTestContext): # Setting input nodes for open nodes for node in open_nodes: input_shapes = node.input_shapes - for i in range(len(node.inputs), node.operator.input_num): - input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shapes[i] and input_node not in node.inputs] - if len(input_nodes_with_same_shape) > 0: + # list of input nodes that are already connected to the node + used_input_nodes: List[RandomizerInputNode] = [] + for open_input_index in NodeUtils.get_open_input_indices(node): + input_shape = input_shapes[open_input_index] + # list of all graph input nodes with the same shape as the input shape + input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shape] + # list of input nodes with the same shape that are not already connected to the node + input_nodes_with_same_shape_unused = [input_node for input_node in input_nodes_with_same_shape if input_node not in used_input_nodes] + if len(input_nodes_with_same_shape_unused) > 0: # reuse existing input node with the same shape that is not already connected to the node - input_node = input_nodes_with_same_shape[0] + input_node = input_nodes_with_same_shape_unused[0] + used_input_nodes.append(input_node) else: - input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}", input_shape=input_shapes[i]) -
graph.input_nodes.append(input_node) - node.inputs.append(input_node) + # there are no input nodes with the same shape that are not already connected to the node + # check if same input value is allowed + # there must be at least one input node with the same shape to allow repeat + allow_repeat = len(input_nodes_with_same_shape) > 0 + + if allow_repeat: + if not same_inputs_rate_limitter.is_allowed(): + logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.get_name()}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + allow_repeat = False + + if allow_repeat: + input_node = rng_shape.choice(input_nodes_with_same_shape) + logger.trace(f"Allowed same input value {input_node.out_value} -> {node.get_name()}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + + else: + # create a new input node with the same shape since there are no unused input nodes with the same shape or repeat is not allowed + input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}", input_shape=input_shape) + used_input_nodes.append(input_node) + # store the new input node in the graph input nodes + graph.input_nodes.append(input_node) + + # connect the input node to the open node input + node.inputs[open_input_index] = input_node logger.trace("Generating random settings for operator parameters") # Generate random values for operator parameters @@ -109,8 +139,8 @@ def validate_graph(cls, graph: RandomizerGraph): # Validation of input configuration for node in nodes: if node.operator.input_num and node.operator.input_num > 1: - if len(node.inputs) != node.operator.input_num: - raise Exception(f"Expected {node.operator.input_num} number of inputs but configured {node.inputs}") + if NodeUtils.num_of_open_inputs(node) > 0: + raise Exception(f"Closed {NodeUtils.num_of_closed_inputs(node)}/{node.operator.input_num} inputs, missing {NodeUtils.num_of_open_inputs(node)} inputs for node {node.node_info()}") # Validation of operator and layer types for node in nodes: @@ -181,6 +211,8 @@ def build_graph(self, test_context: RandomizerTestContext): fork_join_counter = 0 fork_join_max = test_context.randomizer_config.num_fork_joins_max + same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) + # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max num_of_nodes = rng_graph.randint(self.randomizer_config.num_of_nodes_min, self.randomizer_config.num_of_nodes_max) for node_index in range(num_of_nodes, 0, -1): @@ -202,7 +234,9 @@ def build_graph(self, test_context: RandomizerTestContext): random_open_node: RandomizerNode = rng_graph.choice(open_nodes) # Setting output shape based on input shapes of the random open node input_shapes = random_open_node.input_shapes - output_shape = input_shapes[len(random_open_node.inputs)] + open_input_indices = [i for i in NodeUtils.get_open_input_indices(random_open_node)] + open_input_index = open_input_indices[rng_graph.randint(0, len(open_input_indices) - 1)] + output_shape = input_shapes[open_input_index] # Find all other open nodes with input shape mathing the output shape of new node open_nodes = NodeUtils.get_open_nodes_with_input_shape(nodes, output_shape) @@ -229,6 +263,11 @@ def build_graph(self, test_context: RandomizerTestContext): # Select random subset of open nodes to close random_nodes = rng_graph.sample(open_nodes, subset_count) + + if len(random_nodes) > 1: + for 
random_node in random_nodes[1:]: + logger.trace(f"Constructing new fork join from operator op{node_index} {op1.name} -> {random_node.get_name()}") + else: random_nodes = [] @@ -244,13 +283,21 @@ def build_graph(self, test_context: RandomizerTestContext): self._init_default_constructor_params(node) for closing_node in closing_nodes: - for _ in range(rng_graph.randint(1, closing_node.operator.input_num - len(closing_node.inputs))): - # currently only if next input of closing node matches the output shape a closing node will be actually closed - # TODO check all inputs for matching shapes not just next one - if closing_node.input_shapes[len(closing_node.inputs)] == node.output_shape: - closing_node.inputs.append(node) + node_connected = False + for open_input_index in NodeUtils.get_open_input_indices(closing_node): + # check input shape of a closing node open input + if closing_node.input_shapes[open_input_index] == node.output_shape: + + # Limit number of same inputs on same node + if node_connected: + if not same_inputs_rate_limitter.is_allowed(): + logger.trace(f"Skipping same input node connection op{node_index} {node.get_name()} -> {closing_node.get_name()}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + continue + else: + logger.trace(f"Allowed same input node connection op{node_index} {node.get_name()} -> {closing_node.get_name()}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + closing_node.inputs[open_input_index] = node + node_connected = True - open_nodes.append(node) nodes.insert(0, node) logger.trace(f"Graph built with {len(nodes)} nodes") diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py index e4453cf6..a50a6ff7 100644 --- a/pybuda/test/random/rgg/config.py +++ b/pybuda/test/random/rgg/config.py @@ -33,5 +33,6 @@ def get_randomizer_config_default(): num_of_nodes_min=int(os.environ.get("NUM_OF_NODES_MIN", 5)), num_of_nodes_max=int(os.environ.get("NUM_OF_NODES_MAX", 10)), num_fork_joins_max=int(os.environ.get("NUM_OF_FORK_JOINS_MAX", 50)), + same_inputs_percent_limit=int(os.environ.get("SAME_INPUTS_PERCENT_LIMIT", 10)), ) return randomizer_config diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index 90a52d72..a3fb25db 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -27,12 +27,17 @@ class RandomizerNode: index: Optional[int] = None out_value: Optional[str] = None operator: Optional[OperatorDefinition] = None - inputs: List['RandomizerNode'] = field(default_factory=list) + inputs: List['RandomizerNode'] = field(init=False) constructor_kwargs: Dict[str, object] = field(default_factory=dict) forward_kwargs: Dict[str, object] = field(default_factory=dict) input_shapes: List[TensorShape] = field(default_factory=list) output_shape: TensorShape = None + def __post_init__(self): + # List of input nodes is initialized with None values for each input + # Inputs will be set later during graph construction + self.inputs = [None for _ in range(self.operator.input_num)] + def operator_name(self): return f"op{self.index}" @@ -95,6 +100,7 @@ class RandomizerConfig: num_of_nodes_min: int = 5 num_of_nodes_max: int = 10 num_fork_joins_max: int = 50 + same_inputs_percent_limit: int = 10 @dataclass diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index dc533199..316d9b94 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -5,7 +5,7 @@ 
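The datatypes.py change above turns a node's inputs into a fixed-length list of None placeholders, and the utils.py diff that follows adds the helpers that walk those open slots plus the RateLimitter that gates how often one producer may feed several inputs of the same node. A minimal, self-contained sketch of how the two pieces fit together (simplified stand-in classes, not the actual RGG types):

import random
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Node:
    input_num: int
    inputs: List[Optional["Node"]] = field(init=False)

    def __post_init__(self):
        # One slot per operand; a slot stays None until a producer is connected
        self.inputs = [None] * self.input_num

    def open_input_indices(self) -> List[int]:
        return [i for i, producer in enumerate(self.inputs) if producer is None]

rng = random.Random(0)
limit = 10  # cf. SAME_INPUTS_PERCENT_LIMIT
producer, consumer = Node(input_num=0), Node(input_num=3)

for i in consumer.open_input_indices():
    already_used = any(p is producer for p in consumer.inputs)
    # Reusing the same producer on another slot must pass a percentage gate;
    # randint(0, 100) draws from 101 values, so slightly under limit% of draws pass
    if already_used and rng.randint(0, 100) >= limit:
        continue
    consumer.inputs[i] = producer

print(consumer.open_input_indices())  # slots the gate left open, if any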
import random -from typing import Callable, List, Dict +from typing import Callable, Generator, List, Dict from dataclasses import asdict from loguru import logger import re @@ -172,18 +172,41 @@ class NodeUtils: def is_previous_node(node: RandomizerNode, previous_node: RandomizerNode) -> bool: return node.index == previous_node.index + 1 - @staticmethod - def is_open(node: RandomizerNode) -> bool: - return (len(node.inputs) if node.inputs else 0) < node.operator.input_num + @classmethod + def num_of_open_inputs(cls, node: RandomizerNode) -> int: + return node.inputs.count(None) + + @classmethod + def num_of_closed_inputs(cls, node: RandomizerNode) -> int: + return node.operator.input_num - cls.num_of_open_inputs(node) + @classmethod + def is_open(cls, node: RandomizerNode) -> bool: + return cls.num_of_open_inputs(node) > 0 + + # TODO replace list with generator @classmethod def get_open_nodes(cls, nodes: List[RandomizerNode]) -> List[RandomizerNode]: return [node for node in nodes if cls.is_open(node)] + @classmethod + def has_open_input_with_input_shape(cls, node: RandomizerNode, input_shape: TensorShape) -> bool: + for i, open_input in enumerate(node.inputs): + if open_input is None: + if input_shape == node.input_shapes[i]: + return True + return False + + @classmethod + def get_open_input_indices(cls, node: RandomizerNode) -> Generator[int, None, None]: + for i, open_input in enumerate(node.inputs): + if open_input is None: + yield i + + # TODO replace list with generator @classmethod def get_open_nodes_with_input_shape(cls, nodes: List[RandomizerNode], input_shape: TensorShape) -> List[RandomizerNode]: - # TODO support checking not just next operand but all not connected operands - return [node for node in nodes if cls.is_open(node) and node.input_shapes[len(node.inputs)] == input_shape] + return [node for node in nodes if cls.is_open(node) and cls.has_open_input_with_input_shape(node, input_shape)] @classmethod def calc_input_shapes(cls, node: RandomizerNode, rng_shape: random.Random) -> List[TensorShape]: @@ -203,3 +226,25 @@ def format_tensors(cls, tensors: List[pybuda.Tensor]): @classmethod def debug_inputs(cls, inputs: List[pybuda.Tensor]): logger.info(f"inputs: {cls.format_tensors(inputs)}") + + +class RateLimitter: + '''Rate limitter class to limit the number of allowed operations by a rate limit factor''' + + def __init__(self, rng: random.Random, max_limit: int, current_limit: int): + self.rng = rng + self.max_limit = max_limit + self.current_limit = current_limit + self.current_value: int = None + + def is_allowed(self) -> bool: + '''Check if the operation is allowed by the rate limit factor and current random value''' + self.current_value = self.rng.randint(0, 100) + return self.current_value < self.current_limit + + def limit_info(self) -> str: + '''Return the rate limit info for previous operation''' + if self.current_value < self.current_limit: + return f"{self.current_value} < {self.current_limit}" + else: + return f"{self.current_value} >= {self.current_limit}" diff --git a/pybuda/test/utils.py b/pybuda/test/utils.py index 8d8d2e30..46c73f07 100644 --- a/pybuda/test/utils.py +++ b/pybuda/test/utils.py @@ -25,11 +25,13 @@ def download_model(download_func, *args, num_retries=3, timeout=180, **kwargs): class Timer: + '''Timer class to measure the duration of a code block''' def __init__(self): self.start_time = time.perf_counter() def get_duration(self): + '''Calculate the duration of the code block in seconds''' end_time = time.perf_counter() duration = end_time - 
self.start_time return duration From 3ad8ec12c1e50dd709145512de497e8c0ff81b40 Mon Sep 17 00:00:00 2001 From: svuckovic Date: Tue, 2 Jul 2024 15:55:42 +0000 Subject: [PATCH 017/116] [tti] add warning when trying to save TTI image with CPUDevice (cherry picked from commit 3e10284460f4a12c083f79bf1324f7da8e037e51) --- pybuda/pybuda/ttdevice.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pybuda/pybuda/ttdevice.py b/pybuda/pybuda/ttdevice.py index 67b9c5f9..708864ea 100644 --- a/pybuda/pybuda/ttdevice.py +++ b/pybuda/pybuda/ttdevice.py @@ -1532,6 +1532,9 @@ def compile_to_image( _device_mode=self.device_mode ) + if self.cpu_fallback_device_pre is not None or self.cpu_fallback_device_post is not None: + logger.warning("CPU fallback devices are not supported when compiling to TTI image. Only TTDevice will be saved to TTI image. Loading the image will probably end in an error.") + from .tti import TTDeviceImage device_image = TTDeviceImage.create_image_from_device( self, From d49ff0b2459b72359a7a25ec67270e768bdd0105 Mon Sep 17 00:00:00 2001 From: Deepak Sudhakar Date: Fri, 5 Jul 2024 12:03:22 +0000 Subject: [PATCH 018/116] Remove few models in tests_B (cherry picked from commit 1525ba8804be0bcc2a41759d91bdbc66ca43f06a) --- .../tvm/cnn/pytorch/tests_B/test_alphapose.py | 48 ------- .../tvm/cnn/pytorch/tests_B/test_fastdepth.py | 47 ------- .../tvm/cnn/pytorch/tests_B/test_ghostnet.py | 82 ------------ .../tvm/cnn/pytorch/tests_B/test_graph_cnn.py | 80 ------------ .../tvm/cnn/pytorch/tests_B/test_hf_clip.py | 80 ------------ .../cnn/pytorch/tests_B/test_mobilenet_v2.py | 98 --------------- .../cnn/pytorch/tests_B/test_mobilenet_v3.py | 90 ------------- .../tvm/cnn/pytorch/tests_B/test_regnety.py | 42 ------- .../tvm/cnn/pytorch/tests_B/test_resnet.py | 118 ------------------ .../tvm/cnn/pytorch/tests_B/test_resnext.py | 57 --------- .../cnn/pytorch/tests_B/test_shufflenet.py | 45 ------- .../test/tvm/cnn/pytorch/tests_B/test_ssd.py | 49 -------- .../test/tvm/cnn/pytorch/tests_B/test_vgg.py | 45 ------- .../test/tvm/cnn/pytorch/tests_B/test_vilt.py | 72 ----------- .../test/tvm/cnn/pytorch/tests_B/test_vit.py | 107 ---------------- 15 files changed, 1060 deletions(-) delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_alphapose.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_fastdepth.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_graph_cnn.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_hf_clip.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_regnety.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_shufflenet.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_ssd.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_vilt.py delete mode 100644 pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_alphapose.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_alphapose.py deleted file mode 100644 index dacc2a9f..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_alphapose.py +++ /dev/null @@ -1,48 +0,0 @@ -# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch -import os - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -from test.tvm.cnn.pytorch.alphapose.utils.config import update_config -from test.tvm.cnn.pytorch.alphapose.models import builder - -def test_alphapose(test_kind, test_device): - pytest.skip("Has non-singleton 6D shapes, skip for now") - if test_kind == TestKind.TRAINING: - pytest.skip() - dir_path = os.path.dirname(os.path.realpath(__file__)) + "/../alphapose" - cfg = update_config(dir_path + "/256x192_res50_lr1e-3_1x.yaml") - - model = builder.build_sppe(cfg["MODEL"], preset_cfg=cfg["DATA_PRESET"]) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - module = PyTorchModule("alphapose", model) - - input_shape = (1, 3, 256, 192) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - verify_all=True, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_fastdepth.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_fastdepth.py deleted file mode 100644 index a1009790..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_fastdepth.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -from test.tvm.cnn.pytorch.fastdepth.models import MobileNetSkipAdd -from pybuda.config import CompileDepth, _get_global_compiler_config - -def test_fastdepth_pytorch(test_kind, test_device): - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.retain_tvm_python_files = True - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - model = MobileNetSkipAdd(pretrained=False) - - module = PyTorchModule("fastdepth_torch", model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py deleted file mode 100644 index 5eab8f1d..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch -import importlib -import urllib - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -def test_ghostnet(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - #Fusing disabled due to tenstorrent/pybuda#800 - compiler_cfg.enable_auto_fusing=False - - # tenstorrent/pybuda#392 - import os - os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" - - # model = torch.hub.load('huawei-noah/ghostnet', 'ghostnet_1x', pretrained=True) - # Top file from torch hub depends on cuda import, so just get the model directly. 
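The comment above describes a general workaround: fetch the standalone model-definition file and import it directly, so a hubconf.py that imports CUDA at module scope never runs. A sketch of the pattern (the URL and constructor name are placeholders; the deleted test pointed these at the huawei-noah/ghostnet repository):

import importlib.machinery
import urllib.request

# Placeholder location of a self-contained model definition file (illustrative only)
MODEL_DEF_URL = "https://example.com/raw/master/model_def.py"

localfile, _ = urllib.request.urlretrieve(MODEL_DEF_URL)
model_def = importlib.machinery.SourceFileLoader("model_def", localfile).load_module()
# model = model_def.build_model()  # hypothetical constructor exposed by the fetched file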
- localfile, _ = urllib.request.urlretrieve("https://github.com/huawei-noah/ghostnet/raw/master/ghostnet_pytorch/ghostnet.py") - ghostnet_module = importlib.machinery.SourceFileLoader("ghostnet", localfile).load_module() - state_dict_url = 'https://github.com/huawei-noah/ghostnet/raw/master/ghostnet_pytorch/models/state_dict_73.98.pth' - model = ghostnet_module.ghostnet(num_classes=1000, width=1.0, dropout=0.2) - state_dict = torch.hub.load_state_dict_from_url(state_dict_url, progress=True) - model.load_state_dict(state_dict) - module = PyTorchModule("ghostnet", model.float()) - - input_shape = (1, 3, 256, 256) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.9, - ), - ) - -def test_ghostnet_v2(test_kind, test_device): - - pytest.skip("Needs padding") - - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = _get_global_compiler_config() # load global compiler config object - compiler_cfg.balancer_policy = "CNN" - - # Model load - localfile, _ = urllib.request.urlretrieve("https://github.com/huawei-noah/ghostnet/raw/master/ghostnetv2_pytorch/model/ghostnetv2_torch.py") - ghostnetv2_module = importlib.machinery.SourceFileLoader("ghostnetv2", localfile).load_module() - model = ghostnetv2_module.ghostnetv2(num_classes=1000, width=1.6, dropout=0.2, args=None) # width = 1 | 1.3 | 1.6 - model.eval() - - module = PyTorchModule("pt_ghostnet_v2", model.float()) - - input_shape = (1, 3, 256, 256) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_graph_cnn.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_graph_cnn.py deleted file mode 100644 index e22ebf1a..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_graph_cnn.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Simple Graph CNN basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -# from torch_geometric.nn import GCNConv - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_graph_cnn(test_kind, test_device): - # Scatter Addition op is not supported in PyBuda. Can be revised - # once embeddings (over take op) are supported on HW side - pytest.skip() - - if test_kind == TestKind.TRAINING: - # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - class GCNWrapper(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - - self.num_features = 1433 - self.num_classes = 7 - self.conv = GCN(self.num_features, self.num_classes) - self.edge_index = torch.randint(0, 2708, (1, 2, 10556), dtype=torch.int64) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.conv(x, self.edge_index) - - class GCN(torch.nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - self.conv = GCNConv(in_channels, out_channels) - - def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor: - if len(x.shape) != 2: - x = x.squeeze(0) - - if len(edge_index.shape) != 2: - edge_index = edge_index.squeeze(0) - - x = self.conv(x, edge_index) - - return x - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.START_COMPILE - else: - compiler_cfg.compile_depth = CompileDepth.START_COMPILE - - pytorch_model = GCNWrapper() - module = PyTorchModule("graph_cnn", pytorch_model) - - x = torch.rand((1, 2708, 1433)) - - verify_module( - module, - (), - inputs=[(x,),], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_hf_clip.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_hf_clip.py deleted file mode 100644 index 280d0e0b..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_hf_clip.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - - -from PIL import Image -import requests - -from transformers import CLIPProcessor, CLIPModel, CLIPConfig - -import pybuda -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -class ClipWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_ids, pixel_values, attention_mask): - outputs = self.model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - return outputs.logits_per_image - - -def test_hf_clip(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - compiler_cfg.retain_tvm_python_files = True - - config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32") - config.text_config.num_hidden_layers=1 - config.vision_config.num_hidden_layers=1 - model = CLIPModel(config) - - # model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) - pad_len = 32 - (inputs["input_ids"].shape[1] % 32) - input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, pad_len)) - attention_mask = torch.nn.functional.pad(inputs["attention_mask"], (0, pad_len)) - 
pixel_values = inputs["pixel_values"] - - verify_module( - PyTorchModule("clip", ClipWrapper(model)), - (input_ids.shape, pixel_values.shape, attention_mask.shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - inputs=[(input_ids, pixel_values, attention_mask)], - ) - # tt0 = pybuda.TTDevice("tt0", - # devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("clip", ClipWrapper(model))) - # tt0.push_to_inputs((input_ids, pixel_values, attention_mask)) - # output_q = pybuda.run_inference() - # outputs = output_q.get() - - # outputs = model(input_ids, pixel_values, attention_mask) - # logits_per_image = outputs.logits_per_image # this is the image-text similarity score - # probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py deleted file mode 100644 index 9287d34e..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda.config import CompileDepth, _get_global_compiler_config - -from transformers import MobileNetV2FeatureExtractor, MobileNetV2ForSemanticSegmentation -from transformers import AutoImageProcessor - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from pybuda import DataFormat -from test.utils import download_model - - -def test_mobilenetv2_pytorch(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - # tenstorrent/pybuda#392 - import os - os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "mobilenet_v2", pretrained=True - ) - module = PyTorchModule("mobilenetv2", model) - - if test_device.is_silicon(): - pcc = 0.95 - else: - pcc = 0.99 - - input_shape = (1, 3, 224, 224) - - # NOTE: On silicon, this model has a higher PCC when compared to framework when using Float16 (rather than Float16_b) as the fp32_fallback - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - fp32_fallback=DataFormat.Float16 - ), - ) - -def test_mobilenetv2_deeplab(test_kind, test_device): - - pytest.skip("Needs padding") - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - # SET CONV PADDING ENVIRONMENT VARIABLE: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/36 - import os - os.environ["PYBUDA_PAD_SPARSE_MM"] = "{25:26}" - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - model = download_model(MobileNetV2ForSemanticSegmentation.from_pretrained, "Matthijs/deeplabv3_mobilenet_v2_1.0_513") - module = PyTorchModule("mobilenetv2_deeplab", model) - - input_shape = (1, 3, 224, 224) - - # NOTE: On silicon, this model has a higher PCC when compared to framework when using Float16 (rather than Float16_b) as the fp32_fallback - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - fp32_fallback=DataFormat.Float16 - ), - ) \ No newline at end of file diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py deleted file mode 100644 index 7a88b92c..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py +++ /dev/null @@ -1,90 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# MobileNet v1 basic bring-up tests of tracing functionality -# -import pytest - -import torch -from torch import nn -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from pybuda import DataFormat -from test.utils import download_model - - -def test_mobilenet_v3_small(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - # tenstorrent/pybuda#392 - import os - os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" - - model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "mobilenet_v3_small", pretrained=True - ) - module = PyTorchModule("mobilenet_v3_small", model,) - - if test_device.is_silicon(): - pcc = 0.8 - else: - pcc = 0.99 - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - fp32_fallback=DataFormat.Float16 - ), - ) - - -def test_mobilenet_v3_large(test_kind, test_device): - pytest.skip() # if small mobilenet passes than we assume that the larger also passes - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "mobilenet_v3_large", pretrained=True - ) - module = PyTorchModule("mobilenet_v3_large", model,) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_regnety.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_regnety.py deleted file mode 100644 index 34d8e7cb..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_regnety.py +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, - CompileDepth, -) -from pybuda.verify.backend import verify_module -from pybuda.config import _get_global_compiler_config -from pybuda.verify.config import TestKind - -import timm - -def test_regnety_002_pytorch(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - model = model = timm.create_model('regnety_002', pretrained=True) - module = PyTorchModule("regnety_002", model) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py deleted file mode 100644 index ce8c3a25..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py +++ /dev/null @@ -1,118 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - -import pybuda -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.verify.config import TestKind -from pybuda.verify.backend import verify_module -from pybuda.config import CompileDepth, _get_global_compiler_config -from test.utils import download_model - - -def get_relaxed_atol_pcc(test_kind, test_device, microbatch_size=1): - """ - Figure out reasonable pcc/atol for training on silicon - """ - training_atol = 0.3 - training_pcc = 0.95 - if test_device.is_silicon(): - training_pcc = 0.8 - training_atol = 0.55 - inference_atol = 0.1 - inference_pcc = 0.95 - relative_atol = training_atol if test_kind.is_training() else inference_atol - if test_device.is_silicon() and test_kind.is_training(): - relative_atol *= 3.5 - pcc = training_pcc if test_kind.is_training() else inference_pcc - - return relative_atol, pcc - - -def test_resnet_pytorch(test_kind, test_device): - # Always run with recompute in post-commit CI. 
Nightly tests both - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - # compiler_cfg.compile_depth = CompileDepth.FULL - compiler_cfg.balancer_policy = "CNN" - # compiler_cfg.place_on_new_epoch("max_pool2d_14.dc.reshape.0_operand_commute_clone411.dc.sparse_matmul.4.lc2") - - # Issue below is still valid, though it doesn't trigger when fracturing is turned on - # tenstorrent/pybuda#310 - #pybuda.config.override_t_stream_shape( - # "conv2d_0.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", (28, 1) - #) - - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnet18", pretrained=True) - module = PyTorchModule("pt_resnet", model) - - input_shape = (1, 3, 224, 224) - relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - relative_atol=relative_atol, - pcc=pcc, - ), - ) - - -def test_resnet_pytorch_instance_norm(test_kind, test_device): - pytest.skip() # WIP - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - # TODO: Remove - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # compiler_cfg.compile_depth = CompileDepth.BALANCER_PASS - compiler_cfg.compile_depth = CompileDepth.FULL - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.place_on_new_epoch("conv2d_0.dc.reshape.15.dc.sparse_matmul.1.lc2") - - from torchvision.models import resnet18 - model = resnet18(norm_layer=torch.nn.InstanceNorm2d) - module = PyTorchModule("pt_resnet_instance_norm", model) - - input_shape = (1, 3, 224, 224) - out = model(torch.rand(input_shape)) - - relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - relative_atol=relative_atol, - pcc=pcc, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py deleted file mode 100644 index 4a7f2656..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, - BackendType, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_resnext(test_kind, test_device): - if test_device.devtype != BackendType.Silicon: - pytest.skip() # Takes too long in post commit - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - #import pybuda - # tenstorrent/pybuda#310 - #pybuda.config.override_t_stream_shape("conv2d_0.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", (28, 1)) - - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnext50_32x4d", pretrained=True) - module = PyTorchModule("resnext50_32x4d", model) - - input_shape = (1, 3, 224, 224) - pcc = 0.97 if test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_shufflenet.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_shufflenet.py deleted file mode 100644 index 28e0850e..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_shufflenet.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_shufflenetv2_pytorch(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - model = download_model(torch.hub.load, - "pytorch/vision:v0.10.0", "shufflenet_v2_x1_0", pretrained=True - ) - module = PyTorchModule("shufflenet_v2_x1_0", model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ssd.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_ssd.py deleted file mode 100644 index c4707c39..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ssd.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Single Shot MultiBox Detector (SSD) basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from .SSD.ssd import SSD - - -def test_ssd(test_kind, test_device): - pytest.skip() # Testing full models only on nightly. - - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - framework_model = SSD() - input_shape = (1, 3, 300, 300) - - module = PyTorchModule( - "ssd_full_model", - framework_model - ) - - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py deleted file mode 100644 index 7e1b2add..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_vgg_pytorch(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - - model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "vgg11", pretrained=True) - module = PyTorchModule("vgg11", model) - - input_shape = (1, 3, 224, 224) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vilt.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vilt.py deleted file mode 100644 index ec2a015b..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vilt.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -from transformers import ViltModel, ViltConfig - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_vision_language_transformer_encoder(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - #if test_kind.is_training(): - # pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - config = ViltConfig() - config.num_hidden_layers = 1 - model = ViltModel(config) - module = PyTorchModule("VisLanguageTransformerEncoder", model.encoder) - - input_shape = (1, 197, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"layer.0.attention.attention.key.bias"}, - pcc=0.9 - ), - ) - - -def test_tvm_vision_language_transformer_pooler(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - config = ViltConfig() - config.num_hidden_layers = 1 - model = ViltModel(config) - module = PyTorchModule("VisLanguageTransformerPooler", model.pooler) - - input_shape = (1, 197, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.97 - ), - ) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py deleted file mode 100644 index 86540857..00000000 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py +++ /dev/null @@ -1,107 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import pytest - -import torch -from transformers import ViTModel, ViTConfig - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -import pybuda - - -def test_tvm_visual_transformer(test_kind, test_device): - if test_device.arch == pybuda.BackendDevice.Grayskull: - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - - # Compiler configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - # Load model - config = ViTConfig() - config.num_attention_heads = 1 - config.num_hidden_layers = 1 - framework_model = ViTModel(config) - pybuda_model = PyTorchModule("pt_visual_transformer", framework_model) - - # Sanity run - input_shape = (1, 3, 224, 224) - out = framework_model(torch.rand(input_shape)) - - verify_module( - pybuda_model, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_tvm_visual_transformer_encoder(test_kind, test_device): - pytest.skip("Tested in full model test") - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - config = ViTConfig() - config.num_hidden_layers = 1 - model = ViTModel(config) - module = PyTorchModule("VisualTransformerEncoder", model.encoder) - - input_shape = (1, 197, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"layer.0.attention.attention.key.bias"}, - pcc=0.89 - ), - ) - - -def test_tvm_visual_transformer_pooler(test_kind, test_device): - pytest.skip("Tested in full model test") - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - - config = ViTConfig() - config.num_hidden_layers = 1 - model = ViTModel(config) - module = PyTorchModule("VisualTransformerPooler", model.pooler) - - input_shape = (1, 197, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.97, - ), - ) From efb70da7f895b6e152c5f846f8f2d9a018a1ad5c Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic Date: Fri, 5 Jul 2024 15:30:32 +0000 Subject: [PATCH 019/116] BBE update to bbe_to_pybuda_release_20240612_week24 (cherry picked from commit f855d06590f177c07e7913ce2f2689ce1cef1f1a) --- pybuda/csrc/backend_api/module.mk | 4 +++- pybuda/pybuda/tools/run_net2pipe.py | 5 ++++- .../high_prio/cnn/pytorch/test_mlp_mixer.py | 2 +- setup.py | 14 ++++++++++++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pybuda/csrc/backend_api/module.mk b/pybuda/csrc/backend_api/module.mk index 04e772fd..8d1ac084 100644 --- a/pybuda/csrc/backend_api/module.mk +++ b/pybuda/csrc/backend_api/module.mk @@ -12,6 +12,7 @@ BUDABACKEND_LIB = $(BUDABACKEND_LIBDIR)/libtt.so BUDABACKEND_DEVICE = $(BUDABACKEND_LIBDIR)/libdevice.so BUDABACKEND_NET2PIPE = third_party/budabackend/build/bin/net2pipe BUDABACKEND_PIPEGEN = third_party/budabackend/build/bin/pipegen2 +BUDABACKEND_BLOBGEN = third_party/budabackend/build/bin/blobgen2 PYBUDA_CSRC_BACKENDAPI_LIB = $(LIBDIR)/libbackend_api.a PYBUDA_CSRC_BACKENDAPI_SRCS += \ @@ -45,8 +46,9 @@ $(BUDABACKEND_DEVICE): third_party/budabackend ; $(BUDABACKEND_LIB): third_party/budabackend ; $(BUDABACKEND_NET2PIPE): third_party/budabackend ; $(BUDABACKEND_PIPEGEN): third_party/budabackend ; +$(BUDABACKEND_BLOBGEN): third_party/budabackend ; -third_party/budabackend/src/net2pipe: $(BUDABACKEND_NET2PIPE) $(BUDABACKEND_PIPEGEN) ; +third_party/budabackend/src/net2pipe: $(BUDABACKEND_NET2PIPE) $(BUDABACKEND_PIPEGEN) $(BUDABACKEND_BLOBGEN) ; # Each module has a top level target as the entrypoint which must match the subdir name pybuda/csrc/backend_api: $(PYBUDA_CSRC_BACKENDAPI_LIB) $(BUDABACKEND_LIB) $(BUDABACKEND_DEVICE) $(PYBUDA_CSRC_SHARED_UTILS_LIB) ; diff --git a/pybuda/pybuda/tools/run_net2pipe.py b/pybuda/pybuda/tools/run_net2pipe.py index 8d11421d..817e0ea5 100755 --- a/pybuda/pybuda/tools/run_net2pipe.py +++ b/pybuda/pybuda/tools/run_net2pipe.py @@ -36,7 +36,10 @@ def generate_blobgen_cmd( temporal_epoch, chip_ids, ): - blobgen_exe = root + "/src/overlay/blob_gen.rb" + # TODO: This blobgen is deprecated. Use src/blobgen2 c++ code. + # Even further, this whole file shouldn't exist. 
There are exactly the same python + # tools located in third_party/budabackend/verif/common + blobgen_exe = root + "/tb/llk_tb/overlay/blob_gen.rb" temporal_epoch_graph_name = "pipegen_epoch" + str(temporal_epoch) # parse general spec diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py index 518940d4..7531f575 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py @@ -56,7 +56,7 @@ def test_mlp_mixer_timm_pytorch(variant, test_device): elif variant == "mixer_l16_224_in21k": pytest.skip("Bus Error during placer/balancer") elif variant == "mixer_s16_224": - pytest.skip("/home/jenkinsad/pybuda/third_party/budabackend//src/overlay/blob_gen.rb:250:in `ParseStreamString': undefined method `map' for nil:NilClass (NoMethodError)") + pytest.skip("/home/jenkinsad/pybuda/third_party/budabackend//tb/llk_tb/overlay/blob_gen.rb:250:in `ParseStreamString': undefined method `map' for nil:NilClass (NoMethodError)") elif variant == "mixer_s32_224": pytest.skip("Hangs on Grayskull") diff --git a/setup.py b/setup.py index b9f2b95a..9f8afac4 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ }, "bin": { "path": "build/bin" , - "files": ["net2pipe", "pipegen2", "op_model", "dpnra"], + "files": ["net2pipe", "pipegen2", "blobgen2", "op_model", "dpnra"], }, "device_descriptors": { "path": "device", @@ -42,6 +42,8 @@ "blackhole_1x1.yaml", "blackhole_8x10.yaml", "blackhole_80_arch.yaml", + "blackhole_10x14.yaml", + "blackhole_10x14_no_eth.yaml", ] }, @@ -130,10 +132,18 @@ "vcdparse.py", ] }, + # TODO: cleanup, this is deprecated. "overlay": { - "path": "src/overlay", + "path": "tb/llk_tb/overlay", "files": "*" # TODO, clean-up, don't need everything }, + # TODO: cleanup, see if this should be on some other section. 
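Each entry in this setup.py manifest maps a budabackend source directory to the files that must ship with the wheel, with "*" meaning the whole directory. A sketch of how a manifest of this shape can be materialized into a staging directory (the staging function and directory layout below are invented for illustration):

import os
import shutil

# Illustrative subset of the manifest; paths follow the entries in this diff
manifest = {
    "bin": {"path": "build/bin", "files": ["net2pipe", "pipegen2", "blobgen2"]},
    "blobgen2_cpp_overlay": {"path": "src/blobgen2", "files": ["blob_init.hex.static"]},
}

def stage(root: str, dest: str) -> None:
    for section in manifest.values():
        src_dir = os.path.join(root, section["path"])
        out_dir = os.path.join(dest, section["path"])
        os.makedirs(out_dir, exist_ok=True)
        # "*" copies everything in the directory, otherwise only the named files
        names = os.listdir(src_dir) if section["files"] == "*" else section["files"]
        for name in names:
            shutil.copy2(os.path.join(src_dir, name), os.path.join(out_dir, name))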
+ "blobgen2_cpp_overlay": { + "path": "src/blobgen2", + "files": [ + "blob_init.hex.static" + ] + }, "versim_lib": { # TODO, remove "path": "common_lib", "files": "*", From 0fa05f10f78313d1e5bbf75a262890bb8de0b2aa Mon Sep 17 00:00:00 2001 From: mramanathan Date: Mon, 8 Jul 2024 10:25:27 +0000 Subject: [PATCH 020/116] Add fix for pybuda nighly failures (cherry picked from commit d87b48a089ac45da53ba84aa84297a66dbb42b57) --- .../high_prio/cnn/pytorch/test_blazepose.py | 3 ++- .../high_prio/cnn/pytorch/test_efficientnet.py | 13 ++++++++++++- .../model_demos/high_prio/cnn/pytorch/test_vgg.py | 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py index b8a2e556..f302119e 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py @@ -162,7 +162,8 @@ def test_blaze_palm_pytorch_1x1(test_device): # Set PyBDUA environment variable os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index 37f6bb35..33040944 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -11,6 +11,10 @@ from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform from loguru import logger +import torchvision +from torchvision.models import efficientnet_b4, efficientnet_b0, EfficientNet_B4_Weights, EfficientNet_B0_Weights +from torchvision.models._api import WeightsEnum +from torch.hub import load_state_dict_from_url import pybuda from pybuda import VerifyConfig @@ -32,6 +36,10 @@ # "hf_hub:timm/tf_efficientnetv2_s.in21k", ] +def get_state_dict(self, *args, **kwargs): + kwargs.pop("check_hash") + return load_state_dict_from_url(self.url, *args, **kwargs) +WeightsEnum.get_state_dict = get_state_dict @pytest.mark.parametrize("variant", variants) def test_efficientnet_timm(variant, test_device): @@ -168,7 +176,10 @@ def test_efficientnet_torchvision(variant, test_device): # Load model - framework_model = download_model(variant, pretrained=True) + if variant == models.efficientnet_b0: + framework_model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1) + elif variant == models.efficientnet_b4: + framework_model = efficientnet_b4(weights=EfficientNet_B4_Weights.IMAGENET1K_V1) framework_model.eval() pybuda_model = pybuda.PyTorchModule("pt_effnet_torchvis", framework_model) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py index 35965dc3..84cb1ed3 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py @@ -40,7 +40,9 @@ def test_vgg_osmr_pytorch(variant, test_device): os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" - + if variant == "vgg19": + 
compiler_cfg.place_on_new_epoch("conv2d_32.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2") + compiler_cfg.place_on_new_epoch("matmul_39") # STEP 2: Create PyBuda module from PyTorch model # Variants: #['vgg11', 'vgg13', 'vgg16', 'vgg19', From 0521ab991a104e80c89aa7582665d200f404e904 Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Mon, 8 Jul 2024 14:43:20 +0000 Subject: [PATCH 021/116] Remove Partially compiled models (cherry picked from commit 178cbb086447abdfd03e36fdd490493db1d58349) --- .../cnn/tensorflow/tests_A/test_convnext.py | 55 --------- .../tvm/cnn/tensorflow/tests_A/test_resnet.py | 85 -------------- .../cnn/tensorflow/tests_A/test_xception.py | 52 --------- .../tensorflow/tests_B/test_autoencoder.py | 100 ---------------- .../tensorflow/tests_B/test_efficientnet.py | 36 +----- .../cnn/tensorflow/tests_B/test_inception.py | 41 ------- .../tvm/cnn/tensorflow/tests_B/test_mnist.py | 75 ------------ .../cnn/tensorflow/tests_B/test_mobilenet.py | 108 ------------------ .../cnn/tensorflow/tests_B/test_regnety.py | 55 --------- 9 files changed, 1 insertion(+), 606 deletions(-) delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_A/test_resnet.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_A/test_xception.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_B/test_autoencoder.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_B/test_mnist.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py delete mode 100644 pybuda/test/tvm/cnn/tensorflow/tests_B/test_regnety.py diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py b/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py deleted file mode 100644 index d0e3cc51..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import torch - -from transformers import TFConvNextModel, ConvNextConfig - -from pybuda import ( - PyTorchModule, - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from test.tvm.utils import evaluate_framework_vs_pybuda - -import tensorflow as tf - -from pybuda.verify.backend import verify_module - -def test_tvm_convnext(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - # tenstorrent/pybuda#842 - compiler_cfg.compile_depth = ( - CompileDepth.BUDA_GRAPH_PRE_PLACER - ) - - cfg = ConvNextConfig.from_pretrained("facebook/convnext-tiny-224") - framework_model = TFConvNextModel(cfg).convnext.encoder.stages[0].layers[0].dwconv - - - module = TFModule("convnext_full_model_tiny_tf", framework_model) - input_shape = (1, 384, 96, 96) - x = tf.random.normal(input_shape) - framework_model(x) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_resnet.py b/pybuda/test/tvm/cnn/tensorflow/tests_A/test_resnet.py deleted file mode 100644 index 9f89e510..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_resnet.py +++ /dev/null @@ -1,85 +0,0 @@ -# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -import tensorflow as tf -from pybuda.config import CompileDepth, _get_global_compiler_config - - -def test_tvm_resnet_tf(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - model = tf.keras.applications.resnet50.ResNet50( - include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - ) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - mod = TFModule("resnet50_tf", model) - - verify_module( - mod, - ((1, 224, 224, 3),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_tvm_resnet_rs_tf(test_kind, test_device): - - if test_kind.is_training(): - pytest.skip() - - model = tf.keras.applications.resnet_rs.ResNetRS50( - include_top=True, - weights='imagenet', - classes=1000, - input_shape=None, - input_tensor=None, - pooling=None, - classifier_activation='softmax', - include_preprocessing=True - ) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - mod = TFModule("resnet50_rs_tf", model) - - verify_module( - mod, - ((1, 224, 224, 3),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_xception.py b/pybuda/test/tvm/cnn/tensorflow/tests_A/test_xception.py deleted file mode 100644 index 8c9647c0..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_xception.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind -import tensorflow as tf -from pybuda.config import CompileDepth, _get_global_compiler_config - - -def test_xception(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - model = tf.keras.applications.Xception( - include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - ) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - mod = TFModule("xception_tf", model) - - verify_module( - mod, - ((1, 299, 299, 3),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_autoencoder.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_autoencoder.py deleted file mode 100644 index 489bf7db..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_autoencoder.py +++ /dev/null @@ -1,100 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# AutoEncoder basic bring-up tests of tracing 
functionality -# -import pytest - -from keras import Model -from keras import layers -import tensorflow as tf - -from pybuda import ( - TFModule, - VerifyConfig, -) -from pybuda.verify.config import TestKind -from pybuda.verify.backend import verify_module -from pybuda.config import CompileDepth, _get_global_compiler_config - - -def test_conv_autoencoder(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class ConvAE(Model): - def __init__(self): - super(ConvAE, self).__init__() - self.encoder = tf.keras.Sequential( - [ - layers.Conv2D( - filters=16, - kernel_size=3, - strides=2, - activation="relu", - padding="valid", - ), - layers.Conv2D( - filters=8, - kernel_size=3, - strides=2, - activation="relu", - padding="valid", - ), - ] - ) - - self.decoder = tf.keras.Sequential( - [ - layers.Conv2DTranspose( - filters=8, - kernel_size=3, - strides=2, - activation="relu", - padding="valid", - ), - layers.Conv2DTranspose( - filters=16, - kernel_size=3, - strides=2, - activation="relu", - padding="valid", - ), - ] - ) - - def call(self, x): - encoded = self.encoder(x) - decoded = self.decoder(encoded) - - return decoded - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - # Column dimension must be divisible by tile size - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - framework_model = ConvAE() - module = TFModule( - "tf_conv_autoencoder", - framework_model, - ) - - input_shape = (1, 28, 28, 3) - - # Run model - # act = tf.random.uniform(input_shape) - # out = framework_model(act) - - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py index d66b54f8..2b8807ec 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py @@ -72,38 +72,4 @@ def call(self, x): evaluate_framework_vs_pybuda(model, ret, act1) -def test_efficientnet_layer(test_kind, test_device): - if ( - test_kind == TestKind.TRAINING - ): # Always run with recompute in post-commit CI. 
Nightly tests both - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.balancer_policy = "CNN" - - blocks_args = [{ - 'kernel_size': 3, - 'repeats': 1, - 'filters_in': 32, - 'filters_out': 16, - 'expand_ratio': 1, - 'id_skip': True, - 'strides': 1, - 'se_ratio': 0.25 - }] - - input_shape = (1, 32, 112, 112) - model = tf.keras.applications.EfficientNetB0(include_top=False, input_shape=input_shape[1:], blocks_args=blocks_args, weights=None) - - mod = TFModule("efficientnet_b0_layer_tf", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) + diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py deleted file mode 100644 index f01b2790..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - - -import tensorflow as tf -import pybuda -from pybuda import ( - TFModule, - VerifyConfig, - CompileDepth, -) -from pybuda.config import _get_global_compiler_config -from pybuda.verify.config import TestKind -from pybuda.verify.backend import verify_module - -def test_inceptionv3_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - else: - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - input_shape = (1, 229, 229, 3) - model = tf.keras.applications.InceptionV3(include_top=False, input_shape=input_shape[1:]) - mod = TFModule("inceptionv3_tf", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mnist.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mnist.py deleted file mode 100644 index 721e5e94..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mnist.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import tensorflow as tf - -from pybuda import ( - TFModule, - VerifyConfig, - CompileDepth, -) -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify.backend import verify_module -from pybuda.verify.config import TestKind - -input_shapes = [(1, 32, 32, 1)] - -@pytest.mark.parametrize( - "input_shape", input_shapes, ids=[f"input{str(s)}" for s in input_shapes] -) -def test_mnist_tensorflow(test_kind, test_device, input_shape): - if test_kind.is_training(): - pytest.skip() # Backward is currently unsupported - - class MNIST(tf.keras.Model): - def __init__(self): - super().__init__() - - self.conv_padding = tf.keras.layers.ZeroPadding2D(5//2) - self.conv1 = tf.keras.layers.Conv2D(32, 5, padding="valid") - self.maxpool_padding = tf.keras.layers.ZeroPadding2D(1) - self.max_pool = tf.keras.layers.MaxPooling2D(pool_size=3, padding="valid") - self.conv2 = tf.keras.layers.Conv2D(64, 5, padding="valid") - self.conv2_drop = tf.keras.layers.SpatialDropout2D(0.5) - self.fc1 = tf.keras.layers.Dense(320, 
activation="relu") - self.drop = tf.keras.layers.Dropout(0.5) - self.fc2 = tf.keras.layers.Dense(10) - - def call(self, x): - x = self.conv_padding(x) - x = self.conv1(x) - x = self.maxpool_padding(x) - x = self.max_pool(x) - x = tf.keras.activations.relu(x) - x = self.conv_padding(x) - x = self.conv2(x) - x = self.conv2_drop(x) - x = self.maxpool_padding(x) - x = self.max_pool(x) - x = tf.keras.activations.relu(x) - x = tf.reshape(x, (-1, 1024)) - x = self.fc1(x) - x = self.drop(x) - x = self.fc2(x) - x = tf.math.softmax(x) - return tf.math.log(x) - - model = MNIST() - mod = TFModule("mnist", model) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py deleted file mode 100644 index 750670c1..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest -import tensorflow as tf - - -from pybuda import ( - TTDevice, - pybuda_compile, - VerifyConfig, - TFModule, - CompilerConfig, - optimizers, - CompileDepth, - BackendType, - pybuda_reset, -) -from pybuda.config import _get_global_compiler_config -from pybuda.verify.config import TestKind -from pybuda.verify.backend import verify_module - -def test_mobilenetv1_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.balancer_policy = "CNN" - - input_shape = (1, 224, 224, 3) - - act1 = tf.random.uniform(input_shape) - - model = tf.keras.applications.MobileNet ( - input_shape=input_shape[1:] - ) - mod = TFModule("mobilenetv1_tf", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - -def test_mobilenetv2_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.balancer_policy = "CNN" - - input_shape = (1, 224, 224, 3) - - model = tf.keras.applications.MobileNetV2 ( - input_shape=input_shape[1:] - ) - mod = TFModule("mobilenetv2_tf", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - -def test_mobilenetv3_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.balancer_policy = "CNN" - - input_shape = (1, 224, 224, 3) - - model = tf.keras.applications.MobileNetV3Small( - input_shape=None, - alpha=1.0, - minimalistic=False, - include_top=True, - weights='imagenet', - input_tensor=None, - classes=1000, - pooling=None, - dropout_rate=0.2, - classifier_activation='softmax', - include_preprocessing=True - ) - mod = TFModule("mobilenetv3_tf", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - 
devtype=test_device.devtype, - test_kind=test_kind, - ), - ) \ No newline at end of file diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_regnety.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_regnety.py deleted file mode 100644 index b33c4dba..00000000 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_regnety.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import torch - -from pybuda import ( - PyTorchModule, - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.config import CompileDepth -from pybuda.config import _get_global_compiler_config -from pybuda.verify.backend import verify_module - -import tensorflow as tf -import tensorflow_hub as hub - -def test_tvm_regnety002_tf(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - model = tf.keras.applications.regnet.RegNetY002( - model_name='regnety002', - include_top=True, - include_preprocessing=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax' - ) - - mod = TFModule("regnety002", model) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - verify_module( - mod, - ((1, 224, 224, 3),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) From ac8b0d28fb61b3226589a2c2d757fcec6fc9d66f Mon Sep 17 00:00:00 2001 From: jserbedzija Date: Mon, 8 Jul 2024 13:56:11 +0000 Subject: [PATCH 022/116] [Blackhole] Add 64 byte host queue alignment (cherry picked from commit bcc2f2a6bd88ecad961a7d3045627465b8d08f94) --- pybuda/csrc/placer/host_memory.cpp | 2 +- pybuda/csrc/placer/host_memory.hpp | 1 + pybuda/csrc/placer/host_memory_allocator.cpp | 42 +++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pybuda/csrc/placer/host_memory.cpp b/pybuda/csrc/placer/host_memory.cpp index dcebfb90..529c2708 100644 --- a/pybuda/csrc/placer/host_memory.cpp +++ b/pybuda/csrc/placer/host_memory.cpp @@ -14,7 +14,7 @@ namespace tt::placer { HostMemoryPlacerConfig::HostMemoryPlacerConfig( - const DeviceConfig& device_config, bool input_queues_on_host, bool output_queues_on_host) + const DeviceConfig& device_config, bool input_queues_on_host, bool output_queues_on_host) : device_config(device_config) { if (input_queues_on_host) { diff --git a/pybuda/csrc/placer/host_memory.hpp b/pybuda/csrc/placer/host_memory.hpp index b7454a1a..0a49069a 100644 --- a/pybuda/csrc/placer/host_memory.hpp +++ b/pybuda/csrc/placer/host_memory.hpp @@ -47,6 +47,7 @@ class HostChannelMemoryRegion // Host memory is divided into host channels, which are contiguous regions of memory. 
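// (Aside: the device_config reference added to the struct below is what lets the host
// memory allocator pick an arch-specific alignment. The round-up helper this patch
// introduces in host_memory_allocator.cpp is the usual power-of-two mask trick; a
// minimal standalone sketch, with illustrative assert values that are not from the patch:
//
//     static std::uint32_t align_address(std::uint32_t address, std::uint32_t alignment)
//     {
//         // Requires a power-of-two alignment for the mask trick to hold.
//         return (address + alignment - 1) & ~static_cast<std::uint32_t>(alignment - 1);
//     }
//
//     assert(align_address(100, 64) == 128);  // rounded up to the next 64-byte boundary
//     assert(align_address(128, 64) == 128);  // already aligned, left unchanged
// )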
struct HostMemoryPlacerConfig { + const DeviceConfig& device_config; std::vector host_memory_regions; bool input_queues_on_host; bool output_queues_on_host; diff --git a/pybuda/csrc/placer/host_memory_allocator.cpp b/pybuda/csrc/placer/host_memory_allocator.cpp index 1958d015..13cd7dd4 100644 --- a/pybuda/csrc/placer/host_memory_allocator.cpp +++ b/pybuda/csrc/placer/host_memory_allocator.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "placer/host_memory_allocator.hpp" +#include "backend_api/device_config.hpp" #include "balancer/balancer.hpp" #include "graph_lib/node.hpp" #include "placer/allocator_utils.hpp" @@ -12,22 +13,33 @@ namespace tt::placer { -// NB: To ensure device->host writes are 64B aligned(PCIE controller w/ 512-bit interface), we need to allocate -// addresses that are odd multiples of 32 bytes because we need to include the 32 byte tile header. -// See BBE#2175 for more details. -inline static std::uint32_t align_host_address(std::uint32_t address) +inline static std::uint32_t align_address(std::uint32_t address, std::uint32_t alignment) { - constexpr std::uint32_t alignment = 32; - // Add alignment to the address to ensure we go to the next multiple if not already at one - address += (alignment - 1); - - // Align to the next even multiple of `alignment` - address &= ~uintptr_t(alignment - 1); + return (address + alignment - 1) & ~static_cast(alignment - 1); +} - // Check if the result is an odd multiple; if not, add another `alignment` - if ((address / alignment) % 2 == 0) +inline static std::uint32_t align_host_address(std::uint32_t address, const DeviceConfig& device_config) { + if (device_config.is_blackhole()) + { + // On blackhole starting addresses of host queues need to be 64 byte aligned. + // + address = align_address(address, 64 /* alignment */); + } + else { - address += alignment; + // NB: To ensure device->host writes are 64B aligned(PCIE controller w/ 512-bit interface), we need to allocate + // addresses that are odd multiples of 32 bytes because we need to include the 32 byte tile header. + // See BBE#2175 for more details. + // + constexpr std::uint32_t alignment = 32; + address = align_address(address, alignment); + + // Check if the result is an odd multiple; if not, add another alignment. 
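// (Worked example with illustrative values, not from the patch: on pre-Blackhole parts,
// align_host_address(0x1000) first rounds to 0x1000, which is already a multiple of 32;
// 0x1000 / 32 == 128 is even, so the branch below bumps the result to 0x1020, an odd
// multiple of 32. The 32-byte tile header then occupies 0x1020..0x103F, and the payload
// behind it starts at 0x1040, which is 64-byte aligned as the 512-bit PCIE interface
// expects. On Blackhole, the queue start itself is simply rounded to a 64-byte boundary.)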
+ // + if ((address / alignment) % 2 == 0) + { + address += alignment; + } } return address; @@ -35,12 +47,12 @@ inline static std::uint32_t align_host_address(std::uint32_t address) std::uint32_t HostMemoryAllocator::get_current_allocation_address() const { - return align_host_address(this->current_allocation_address); + return align_host_address(this->current_allocation_address, config.device_config); } void HostMemoryAllocator::increment_allocation_address(const std::uint32_t size) { - this->current_allocation_address = align_host_address(this->get_current_allocation_address() + size); + this->current_allocation_address = align_host_address(this->get_current_allocation_address() + size, config.device_config); } std::pair HostMemoryAllocator::allocate_memory(const graphlib::Node* node, std::uint32_t queue_size) From a236bed739f0191430bb82587fd09d3e91c35da3 Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Mon, 8 Jul 2024 11:47:15 +0000 Subject: [PATCH 023/116] [CCM] Reconstruct and reorganize internal and customer files and update ci script (cherry picked from commit 12b01fb210508673e4658f8baa189b622cb37185) --- .../test/model_demos/high_prio/cnn/onnx/test_ddrnet.py | 4 ++-- pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py | 2 +- .../model_demos/high_prio/cnn/onnx/test_hardnet.py | 2 +- .../high_prio/cnn/onnx/test_perceiverio_conv.py | 2 +- .../high_prio/cnn/onnx/test_perceiverio_fourier.py | 2 +- .../high_prio/cnn/onnx/test_perceiverio_learned.py | 2 +- .../model_demos/high_prio/cnn/onnx/test_retinanet.py | 2 +- .../high_prio/cnn/onnx/test_segformer_imgcls_1.py | 2 +- .../high_prio/cnn/onnx/test_segformer_imgcls_2.py | 2 +- .../high_prio/cnn/onnx/test_segformer_seg_1.py | 2 +- .../high_prio/cnn/onnx/test_segformer_seg_2.py | 2 +- .../model_demos/high_prio/cnn/onnx/test_yolo_v5.py | 6 +++--- .../test/model_demos/high_prio/cnn/onnx/test_yolo_x.py | 2 +- .../model_demos/high_prio/cnn/pytorch/test_ddrnet.py | 10 +++++----- .../model_demos/high_prio/cnn/pytorch/test_hardnet.py | 2 +- .../model_demos/high_prio/cnn/pytorch/test_pidnet.py | 4 ++-- .../high_prio/cnn/pytorch/test_tri_basic_2.py | 4 ++-- 17 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py index fb66bbcf..8d2fd159 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py @@ -38,7 +38,7 @@ def test_ddrnet(variant, test_device): model_name = f"{variant}_onnx" load_path = ( - f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + f"third_party/confidential_customer_models/internal/ddrnet/files/onnx/{variant}.onnx" ) model = onnx.load(load_path) @@ -153,7 +153,7 @@ def test_ddrnet_semantic_segmentation_onnx(variant, test_device): tt_model = pybuda.OnnxModule(model_name, model, load_path) # Prepare input - image_path = "third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/image/road_scenes.png" + image_path = "third_party/confidential_customer_models/internal/ddrnet/files/samples/road_scenes.png" input_image = Image.open(image_path) input_image = transforms.Resize((1024, 1024))(input_image) input_tensor = transforms.ToTensor()(input_image) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py index 8a5815f4..194aca2e 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py +++ 
b/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py @@ -17,7 +17,7 @@ def test_fpn_onnx(test_device, test_kind): os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" # Load FPN model - onnx_model_path = "third_party/confidential_customer_models/generated/files/fpn.onnx" + onnx_model_path = "third_party/confidential_customer_models/internal/fpn/files/onnx/fpn.onnx" model = onnx.load(onnx_model_path) tt_model = pybuda.OnnxModule("onnx_fpn", model, onnx_model_path) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py index 280008d3..bc784d3f 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py @@ -52,7 +52,7 @@ def test_hardnet_onnx(variant, test_device): img_tensor = input_tensor.unsqueeze(0) load_path = ( - f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + f"third_party/confidential_customer_models/internal/hardnet/files/onnx/{variant}.onnx" ) model_name = f"{variant}_onnx" diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py index 197fabbf..c067b73a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_conv.py @@ -58,7 +58,7 @@ def test_perceiverio_conv_imgcls_onnx(test_device): verify_enabled = False onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/perceiverio/files/onnx/" + str(model_name).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py index 1c575c31..e04902e1 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_fourier.py @@ -52,7 +52,7 @@ def test_perceiverio_fourier_imgcls_onnx(test_device): verify_enabled = False onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/perceiverio/files/onnx/" + str(model_name).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py index 9b325e64..1fdf91e7 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py @@ -48,7 +48,7 @@ def test_perceiverio_learned_imgcls_onnx(test_device): verify_enabled = False onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/perceiverio/files/onnx/" + str(model_name).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py index 9aef59b9..af67f6d2 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py @@ -171,7 +171,7 @@ def test_retinanet_onnx(variant, test_device): # Prepare model load_path = ( - f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + 
f"third_party/confidential_customer_models/internal/retinanet/files/onnx/{variant}.onnx" ) model_name = f"onnx_{variant}" model = onnx.load(load_path) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py index 07e718ff..21aa3fb0 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py @@ -66,7 +66,7 @@ def test_segformer_imgcls_onnx_1(test_device, variant): pixel_values = get_sample_data(variant) onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/segformer/files/onnx/imgcls/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py index 5e8521a0..e3e0e0f2 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_2.py @@ -50,7 +50,7 @@ def test_segformer_imgcls_onnx_2(test_device, variant): pixel_values = get_sample_data(variant) onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/segformer/files/onnx/imgcls/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py index d0ca4f98..8994f017 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_1.py @@ -75,7 +75,7 @@ def test_segformer_semseg_onnx_1(test_device, variant): pixel_values = get_sample_data(variant) onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/segformer/files/onnx/semseg/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py index ee4b5914..daa6a485 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_seg_2.py @@ -66,7 +66,7 @@ def test_segformer_semseg_onnx_2(test_device, variant): pixel_values = get_sample_data(variant) onnx_model_path = ( - "third_party/confidential_customer_models/generated/files/" + "third_party/confidential_customer_models/internal/segformer/files/onnx/semseg/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py index 1d45533b..ea96f815 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py @@ -79,7 +79,7 @@ def test_yolo_v5_320x320_onnx(test_device, variant): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" # Load the ONNX model - onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model_path = f"third_party/confidential_customer_models/internal/yolo_v5/files/onnx/{variant}_{input_size}.onnx" onnx_model = 
onnx.load(onnx_model_path) model_name = f"{variant}_{input_size}_onnx" @@ -185,7 +185,7 @@ def test_yolo_v5_480x480_onnx(test_device, variant): input_size = 480 # Load the ONNX model - onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model_path = f"third_party/confidential_customer_models/internal/yolo_v5/files/onnx/{variant}_{input_size}.onnx" onnx_model = onnx.load(onnx_model_path) model_name = f"{variant}_{input_size}_onnx" @@ -294,7 +294,7 @@ def test_yolo_v5_640x640_onnx(test_device, variant): input_size = 640 # Load the ONNX model - onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model_path = f"third_party/confidential_customer_models/internal/yolo_v5/files/onnx/{variant}_{input_size}.onnx" onnx_model = onnx.load(onnx_model_path) model_name = f"{variant}_{input_size}_onnx" diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py index 08e6f4ea..c30dd092 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py @@ -206,7 +206,7 @@ def test_yolox_onnx(variant, test_device): img_tensor = img_tensor.unsqueeze(0) # Load and validate the ONNX model - onnx_model_path = f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + onnx_model_path = f"third_party/confidential_customer_models/internal/yolox/files/onnx/{variant}.onnx" onnx_model = onnx.load(onnx_model_path) onnx.checker.check_model(onnx_model) model_name = f"onnx_{variant}" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py index 4d0470ef..8309c288 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -13,11 +13,11 @@ import sys from pybuda._C.backend_api import BackendDevice -sys.path.append("third_party/confidential_customer_models/generated/scripts/") +sys.path.append("third_party/confidential_customer_models/internal/ddrnet/scripts/image_classification") from model_ddrnet import DualResNet_23, DualResNet_39, BasicBlock sys.path.append( - "third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/model" + "third_party/confidential_customer_models/internal/ddrnet/scripts/semantic_segmentation" ) from semseg import DualResNet, BasicBlock_seg @@ -53,7 +53,7 @@ def test_ddrnet_pytorch(variant, test_device): ) state_dict_path = ( - f"third_party/confidential_customer_models/generated/files/{variant}.pth" + f"third_party/confidential_customer_models/internal/ddrnet/files/weights/{variant}.pth" ) state_dict = torch.load(state_dict_path, map_location=torch.device("cpu")) @@ -143,7 +143,7 @@ def test_ddrnet_semantic_segmentation_pytorch(variant, test_device): augment=True, ) - state_dict_path = f"third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/weights/{variant}.pth" + state_dict_path = f"third_party/confidential_customer_models/internal/ddrnet/files/weights/{variant}.pth" state_dict = torch.load(state_dict_path, map_location=torch.device("cpu")) model.load_state_dict(state_dict, strict=False) model.eval() @@ -151,7 +151,7 @@ def test_ddrnet_semantic_segmentation_pytorch(variant, test_device): tt_model = pybuda.PyTorchModule(model_name, model) # prepare input - image_path = 
"third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/image/road_scenes.png" + image_path = "third_party/confidential_customer_models/internal/ddrnet/files/samples/road_scenes.png" input_image = Image.open(image_path) input_tensor = transforms.ToTensor()(input_image) input_batch = input_tensor.unsqueeze(0) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py index de08e00e..0ed81cb7 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py @@ -36,7 +36,7 @@ def test_hardnet_pytorch(test_device, variant): # load the weights downloaded from https://github.com/PingoLH/Pytorch-HarDNet checkpoint_path = ( - f"third_party/confidential_customer_models/generated/files/{variant}.pth" + f"third_party/confidential_customer_models/internal/hardnet/files/weights/{variant}.pth" ) # Load weights from the checkpoint file and maps tensors to CPU, ensuring compatibility even without a GPU. diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py index 092eadc4..6f891074 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py @@ -11,7 +11,7 @@ from pybuda import VerifyConfig import sys -sys.path.append("third_party/confidential_customer_models/cv_demos/pidnet/model") +sys.path.append("third_party/confidential_customer_models/internal/pidnet/scripts") from model_pidnet import update_model_config, get_seg_model @@ -29,7 +29,7 @@ def test_pidnet_pytorch(variant, test_device): os.environ["PYBUDA_RIBBON2"] = "1" # Load and pre-process image - image_path = "./third_party/confidential_customer_models/cv_demos/pidnet/image/road_scenes.png" + image_path = "third_party/confidential_customer_models/internal/pidnet/files/samples/road_scenes.png" image = cv2.imread(image_path, cv2.IMREAD_COLOR) image = image.astype(np.float32)[:, :, ::-1] image = image / 255.0 diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py index ca4621fa..e8485e8f 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -63,7 +63,7 @@ def test_tri_basic_2_sematic_segmentation_pytorch(test_device): image_w = 800 image_h = 800 image = cv2.imread( - "third_party/confidential_customer_models/cv_demos/tri_basic_2/images/left.png" + "third_party/confidential_customer_models/internal/tri_basic_2/files/samples/left.png" ) image = cv2.resize(image, (image_w, image_h), interpolation=cv2.INTER_LINEAR) image_tensor = ( @@ -74,7 +74,7 @@ def test_tri_basic_2_sematic_segmentation_pytorch(test_device): hparams = SimpleNamespace(num_classes=24) model = resnet34_semseg(hparams) state_dict = torch.load( - "third_party/confidential_customer_models/cv_demos/tri_basic_2/weights/basic_semseg.ckpt", + "third_party/confidential_customer_models/internal/tri_basic_2/files/weights/basic_semseg.ckpt", map_location="cpu", ) model.load_state_dict(state_dict) From 3a14d7538896d39ef29e5b7a70bb7e80578eb4b5 Mon Sep 17 00:00:00 2001 From: Deepak Sudhakar Date: Wed, 10 Jul 2024 09:08:38 +0000 Subject: [PATCH 024/116] Remove nlp onnx models in push pipeline (cherry picked from commit cbcdd452d79007802e86ed299e5ca82c1ba84a00) --- 
.../tvm/nlp/onnx/tests_A/test_unispeech.py | 71 ----- .../test/tvm/nlp/onnx/tests_A/test_wav2vec.py | 68 ----- pybuda/test/tvm/nlp/onnx/tests_B/test_bert.py | 38 --- pybuda/test/tvm/nlp/onnx/tests_B/test_detr.py | 162 ---------- pybuda/test/tvm/nlp/onnx/tests_B/test_gptj.py | 67 ----- .../test/tvm/nlp/onnx/tests_C/test_gptneo.py | 66 ----- .../test/tvm/nlp/onnx/tests_C/test_nbeats.py | 279 ------------------ .../tvm/nlp/onnx/tests_C/test_squeeze_bert.py | 134 --------- pybuda/test/tvm/nlp/onnx/tests_C/test_xlm.py | 74 ----- 9 files changed, 959 deletions(-) delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_A/test_unispeech.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_A/test_wav2vec.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_B/test_detr.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_C/test_gptneo.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_C/test_nbeats.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_C/test_squeeze_bert.py delete mode 100644 pybuda/test/tvm/nlp/onnx/tests_C/test_xlm.py diff --git a/pybuda/test/tvm/nlp/onnx/tests_A/test_unispeech.py b/pybuda/test/tvm/nlp/onnx/tests_A/test_unispeech.py deleted file mode 100644 index 3761f206..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_A/test_unispeech.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# UniSpeech basic bring-up tests of tracing functionality -# -import pytest - -import torch -from transformers import UniSpeechModel - -from pybuda import ( - OnnxModule, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.config import CompileDepth -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import os -import onnx - - -def test_unispeech(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - else: - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER # Unsupported HW ops - - framework_model = UniSpeechModel.from_pretrained( - "microsoft/unispeech-sat-base", torchscript=True - ) - input_shape = (1, 512) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/unispeech_onnx.onnx" - - torch.onnx.export(framework_model, # model being run - torch.randn(input_shape), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=12, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "unispeech_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - os.remove(save_path) diff --git a/pybuda/test/tvm/nlp/onnx/tests_A/test_wav2vec.py b/pybuda/test/tvm/nlp/onnx/tests_A/test_wav2vec.py deleted file mode 100644 index b2c9bac6..00000000 --- 
a/pybuda/test/tvm/nlp/onnx/tests_A/test_wav2vec.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch -from transformers import Wav2Vec2Model - -from pybuda import ( - OnnxModule, - VerifyConfig, -) -from pybuda.config import CompileDepth -from test.tvm.utils import evaluate_framework_vs_pybuda -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import os -import onnx - -def test_wav2vec2(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - # Unsupported concatenate backward - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - else: - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - framework_model = Wav2Vec2Model.from_pretrained( - "facebook/wav2vec2-base", torchscript=True - ) - - input_shape = (1, 512) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/wav2vec_onnx.onnx" - - torch.onnx.export(framework_model, # model being run - torch.randn(input_shape), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=12, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "wav2vec_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - os.remove(save_path) \ No newline at end of file diff --git a/pybuda/test/tvm/nlp/onnx/tests_B/test_bert.py b/pybuda/test/tvm/nlp/onnx/tests_B/test_bert.py index 68b75c20..c5bb5c09 100644 --- a/pybuda/test/tvm/nlp/onnx/tests_B/test_bert.py +++ b/pybuda/test/tvm/nlp/onnx/tests_B/test_bert.py @@ -25,45 +25,7 @@ import urllib import os -@pytest.mark.skip(reason="Pretrained Onnx Bert casts int to float in TVM.") -def test_tvm_bert_squad_onnx(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - pytest.skip() - test_device.devtype = BackendType.NoBackend - save_path = os.path.dirname(os.path.realpath(__file__)) + "bert_squad.onnx" - - if not os.path.exists(save_path): - urllib.request.urlretrieve( - "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-12.onnx", - save_path, - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "bert_squad_onnx", - onnx_model, - save_path, - ) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - input_shape = (1, 32) - verify_module( - mod, - ((256,),(1, 256,),(1,256,),(1, 256,),), - input_params=[{"data_format" : torch.int64},{"data_format" : torch.int64},{"data_format" : torch.int64},{"data_format" : torch.int64},], - 
verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - os.remove(save_path) from transformers import BertModel, BertConfig, BertForPreTraining diff --git a/pybuda/test/tvm/nlp/onnx/tests_B/test_detr.py b/pybuda/test/tvm/nlp/onnx/tests_B/test_detr.py deleted file mode 100644 index dfb9ecb1..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_B/test_detr.py +++ /dev/null @@ -1,162 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# DETR basic bring-up tests of tracing functionality -# -import os -import pytest - -import onnx -import torch -from transformers import DetrModel - -from pybuda import ( - OnnxModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.config import CompileDepth -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_detr_encoder_layer(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class DeTrEncoderWrapper(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.layer = module.encoder.layers[0] - self.attn_mask = torch.rand((1, 1, 256, 256)) - self.pos_emb = torch.rand((1, 256)) - - def forward(self, hidden_states): - return self.layer(hidden_states, self.attn_mask, self.pos_emb) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW op: heaviside - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - # Configure PyTorch module - pytorch_module = DetrModel.from_pretrained( - "facebook/detr-resnet-50", torchscript=True - ) - pytorch_module = DeTrEncoderWrapper(pytorch_module) - - # Export to ONNX - input_shape = (1, 256, 256) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/detr_encoder_layer.onnx" - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "detr_encoder_layer_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors=("layer.self_attn.k_proj.bias"), - ), - ) - - # Cleanup - os.remove(save_path) - - -def test_detr_decoder_layer(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class DeTrDecoderWrapper(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.layer = module.decoder.layers[0] - self.attn_mask = torch.rand((1, 1, 256, 256)) - self.pos_emb = torch.rand((1, 256)) - self.kv_state = torch.rand((1, 1, 256, 256)) - - def forward(self, hidden_states): - return self.layer( - hidden_states, self.attn_mask, self.pos_emb, self.kv_state - ) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW op: heaviside - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER 
- - # Configure PyTorch module - pytorch_module = DetrModel.from_pretrained( - "facebook/detr-resnet-50", torchscript=True - ) - pytorch_module = DeTrDecoderWrapper(pytorch_module) - - # Export to ONNX - input_shape = (1, 256, 256) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/detr_decoder_layer.onnx" - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "detr_decoder_layer_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors=("layer.self_attn.k_proj.bias"), - ), - ) - - # Cleanup - os.remove(save_path) diff --git a/pybuda/test/tvm/nlp/onnx/tests_B/test_gptj.py b/pybuda/test/tvm/nlp/onnx/tests_B/test_gptj.py index 224a71e7..4dd2061e 100644 --- a/pybuda/test/tvm/nlp/onnx/tests_B/test_gptj.py +++ b/pybuda/test/tvm/nlp/onnx/tests_B/test_gptj.py @@ -25,74 +25,7 @@ from pybuda.verify.config import TestKind -def test_gptj_block(test_kind, test_device): - # unspported op Gather with the new environment - # tenstorrent/pybuda#1610 - pytest.skip() - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.retain_tvm_python_files = True - if not test_kind.is_training(): - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - # Unsupported concatenate backward pass - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - # Configure PyTorch module - config = GPTJConfig(n_layer=1) # for faster loading - config.rotary_dim = 64 - pytorch_module = GPTJBlock(config) - - # Export to ONNX - input_shape = (1, 128, 4096) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/gptj_block.onnx" - - # Add position_ids to args - position_ids = torch.arange(0, input_shape[-2], dtype=torch.long, device='cpu') - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-2]) - torch.onnx.export( - pytorch_module, - args=(torch.rand(input_shape), {"position_ids": position_ids,}), - f=save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - pybuda_onnx_module = OnnxModule( - "gptj_block_onnx", - onnx_module, - save_path, - ) - - input_shape = [] - for i in range(len(onnx_module.graph.input)): - dimension = onnx_module.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - verify_module( - pybuda_onnx_module, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - # Cleanup - os.remove(save_path) def fixed_pos_embedding(x, seq_dim=1, seq_len=None): diff --git a/pybuda/test/tvm/nlp/onnx/tests_C/test_gptneo.py b/pybuda/test/tvm/nlp/onnx/tests_C/test_gptneo.py deleted file mode 100644 index 3baae700..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_C/test_gptneo.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: 
Apache-2.0 -import pytest - -import torch -from transformers import GPTNeoModel, GPTNeoConfig - -from pybuda import ( - PyTorchModule, - CompileDepth, - VerifyConfig, - OnnxModule, -) -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import onnx -import os - - -def test_gptneo_onnx(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - torch.manual_seed(52) - input_shape = (1, 64, 2560) - config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B", torchscript=True) - config.num_layers = 1 # For faster model loading - model = GPTNeoModel(config) - submodel = model.h[0] - save_path = os.path.dirname(os.path.realpath(__file__)) + "/gptneo_onnx.onnx" - traced_model = torch.jit.trace(submodel, tuple([torch.randn(input_shape),]), strict=False) - torch.onnx.export(traced_model, # model being run - torch.randn(input_shape), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=10, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "gptneo_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - os.remove(save_path) - diff --git a/pybuda/test/tvm/nlp/onnx/tests_C/test_nbeats.py b/pybuda/test/tvm/nlp/onnx/tests_C/test_nbeats.py deleted file mode 100644 index d706a721..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_C/test_nbeats.py +++ /dev/null @@ -1,279 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os - -import onnx -import torch -import pytest -from pytorch_forecasting.models.nbeats.sub_modules import ( - NBEATSBlock, - NBEATSGenericBlock, - NBEATSTrendBlock, - NBEATSSeasonalBlock, -) - -from pybuda import ( - OnnxModule, - BackendType, - VerifyConfig, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import CompileDepth - - -def test_tvm_nbeats_block(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if not test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - # Configure PyTorch module - input_shape = (1, 64, 64, 64) - pytorch_module = NBEATSBlock(100, 100, backcast_length=input_shape[-1]) - - # Export to ONNX - save_path = os.path.dirname(os.path.realpath(__file__)) + "/nbeats_block.onnx" - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, 
- export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "nbeats_block_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - # Cleanup - os.remove(save_path) - - -def test_tvm_nbeats_generic_block(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if not test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - # Configure PyTorch module - input_shape = (1, 64, 64, 64) - pytorch_module = NBEATSGenericBlock(100, 100, backcast_length=input_shape[-1]) - - # Export to ONNX - save_path = ( - os.path.dirname(os.path.realpath(__file__)) + "/nbeats_generic_block.onnx" - ) - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "nbeats_generic_block_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - # Cleanup - os.remove(save_path) - - -def test_tvm_nbeats_seasonal_block(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class NBeatsSeasonal(NBEATSSeasonalBlock): - def __init__(self, units, thetas_dim, backcast_length): - super().__init__( - units, - thetas_dim=thetas_dim, - backcast_length=backcast_length, - ) - - def forward(self, x): - x = super(NBEATSSeasonalBlock, self).forward(x) - amplitudes_backward = self.theta_b_fc(x) - backcast = amplitudes_backward.matmul(self.S_backcast) - amplitudes_forward = self.theta_f_fc(x) - forecast = amplitudes_forward.matmul(self.S_forecast) - - return backcast, forecast - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if not test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - # Configure PyTorch module - input_shape = (1, 64, 64, 64) - pytorch_module = NBeatsSeasonal(100, 100, backcast_length=input_shape[-1]) - - # Export to ONNX - save_path = ( - os.path.dirname(os.path.realpath(__file__)) + "/nbeats_seasonal_block.onnx" - ) - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "nbeats_seasonal_block_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( 
- arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - # Cleanup - os.remove(save_path) - - -def test_tvm_nbeats_trend_block(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class NBeatsTrend(NBEATSTrendBlock): - def __init__(self, units, thetas_dim, backcast_length): - super().__init__( - units, - thetas_dim=thetas_dim, - backcast_length=backcast_length, - ) - - def forward(self, x): - x = super(NBEATSTrendBlock, self).forward(x) - backcast = self.theta_b_fc(x).matmul(self.T_backcast) - forecast = self.theta_f_fc(x).matmul(self.T_forecast) - return backcast, forecast - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if not test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - # Configure PyTorch module - input_shape = (1, 64, 64, 64) - pytorch_module = NBeatsTrend(100, 100, backcast_length=input_shape[-1]) - - # Export to ONNX - save_path = os.path.dirname(os.path.realpath(__file__)) + "/nbeats_trend_block.onnx" - torch.onnx.export( - pytorch_module, - torch.rand(input_shape), - save_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=["input"], - output_names=["output"], - ) - - # Load ONNX module - onnx_module = onnx.load(save_path) - onnx.checker.check_model(onnx_module) - onnx_module = OnnxModule( - "nbeats_trend_block_onnx", - onnx_module, - save_path, - ) - - verify_module( - onnx_module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - # Cleanup - os.remove(save_path) diff --git a/pybuda/test/tvm/nlp/onnx/tests_C/test_squeeze_bert.py b/pybuda/test/tvm/nlp/onnx/tests_C/test_squeeze_bert.py deleted file mode 100644 index 3eba76f4..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_C/test_squeeze_bert.py +++ /dev/null @@ -1,134 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest - -import torch -import torch.nn as nn -# from transformers.models.squeezebert import SqueezeBertEncoder -from transformers import SqueezeBertModel, SqueezeBertConfig - -import math -import itertools -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - OnnxModule, - optimizers, - pybuda_compile, - tvm_to_python, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import os -import onnx -import onnxruntime as ort - -def test_tvm_SqueezeBertEncoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - input_shape = (1, 32, 768) - - config = SqueezeBertConfig() - config.num_hidden_layers = 1 - model = SqueezeBertModel(config) - - attention_mask = torch.ones(input_shape[0:2]) - extended_attn_mask = model.get_extended_attention_mask(attention_mask, input_shape[0:2], "cpu") - - save_path = 
os.path.dirname(os.path.realpath(__file__)) + "/SqueezeBertEncoder_onnx.onnx" - - torch.onnx.export(model.encoder, # model being run - tuple([torch.randn(input_shape), extended_attn_mask]), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=10, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "SqueezeBertEncoder_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,extended_attn_mask.shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - os.remove(save_path) - - - - -def test_tvm_SqueezeBertPooler(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - config = SqueezeBertConfig() - - model = SqueezeBertModel(config) - - input_shape = (1, 8, 768) - - save_path = os.path.dirname(os.path.realpath(__file__)) + "/SqueezeBertPooler_onnx.onnx" - - traced_model = torch.jit.trace(model.pooler, tuple([torch.randn(input_shape),]), strict=False) - - torch.onnx.export(traced_model, # model being run - tuple([torch.randn(input_shape),]), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=10, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "SqueezeBertPooler_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ) - ) - os.remove(save_path) diff --git a/pybuda/test/tvm/nlp/onnx/tests_C/test_xlm.py b/pybuda/test/tvm/nlp/onnx/tests_C/test_xlm.py deleted file mode 100644 index 18e93bf2..00000000 --- a/pybuda/test/tvm/nlp/onnx/tests_C/test_xlm.py +++ /dev/null @@ -1,74 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest - -import torch -import torch.nn as nn -from transformers.models.xlm import XLMConfig, XLMModel, XLMPreTrainedModel - -import math -import itertools -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, - tvm_to_python, - OnnxModule, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -import os -import onnx - -def test_tvm_xlm_FFN(test_kind, 
test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - config = XLMConfig() - - model = XLMModel(config) - - input_shape = (1, 16, 2048) - save_path = os.path.dirname(os.path.realpath(__file__)) + "/xlm_FFN_onnx.onnx" - - torch.onnx.export(model.ffns[0], # model being run - torch.rand(input_shape), # model input (or a tuple for multiple inputs), - save_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=10, # the ONNX version to export the model to - do_constant_folding=True, # whether to execute constant folding for optimization - input_names = ['input'], # the model's input names - output_names = ['output'], # the model's output names - ) - - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - mod = OnnxModule( - "xlm_FFN_onnx", - onnx_model, - save_path, - ) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ) - ) - os.remove(save_path) From d86581bd9c61b990acdd08517be178c5c9fd0c46 Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat Date: Thu, 11 Jul 2024 12:42:35 +0000 Subject: [PATCH 025/116] MNIST overfit, PyTorch vs PyBuda (cherry picked from commit 63595fc7f18c6f2e2a63b1f3592d4454851f0d14) --- pybuda/test/training/mnist/__init__.py | 0 .../training/mnist/mnist_pybuda_overfit.py | 74 +++++++++++++++++++ .../training/mnist/mnist_pytorch_overfit.py | 57 ++++++++++++++ pybuda/test/training/mnist/utils.py | 62 ++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 pybuda/test/training/mnist/__init__.py create mode 100644 pybuda/test/training/mnist/mnist_pybuda_overfit.py create mode 100644 pybuda/test/training/mnist/mnist_pytorch_overfit.py create mode 100644 pybuda/test/training/mnist/utils.py diff --git a/pybuda/test/training/mnist/__init__.py b/pybuda/test/training/mnist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pybuda/test/training/mnist/mnist_pybuda_overfit.py b/pybuda/test/training/mnist/mnist_pybuda_overfit.py new file mode 100644 index 00000000..9ae296d8 --- /dev/null +++ b/pybuda/test/training/mnist/mnist_pybuda_overfit.py @@ -0,0 +1,74 @@ +import torch +from torch import nn + +import pybuda +from pybuda import ( + CPUDevice, + PyTorchModule, +) +from utils import ( + MNISTLinear, + Identity, + load_tb_writer, + load_dataset, +) + +def main(): + torch.manual_seed(0) + + # Config + num_steps = 200 + batch_size = 1 + learning_rate = 0.01 + sequential = True + + # Load dataset + test_loader, train_loader = load_dataset(batch_size) + + # Load TensorBoard writer + writer = load_tb_writer() + + # Dataset sample input + sample_input = (test_loader.dataset[0][0].repeat(batch_size, 1),) + sample_target = ( + nn.functional.one_hot(torch.tensor(test_loader.dataset[0][1]), num_classes=10) + .float() + .repeat(batch_size, 1) + ) + + # Initialize model + framework_model = MNISTLinear() + tt_model = pybuda.PyTorchModule("mnist_linear", framework_model) + + tt_optimizer = pybuda.optimizers.SGD( + learning_rate=learning_rate, device_params=True + ) + tt0 = pybuda.TTDevice("tt0", module=tt_model, optimizer=tt_optimizer) + + cpu0 = CPUDevice("cpu0", module=PyTorchModule("identity", Identity())) + 
cpu0.place_loss_module(pybuda.PyTorchModule("l1_loss", torch.nn.L1Loss()))
+
+    checkpoint_queue = pybuda.initialize_pipeline(
+        training=True,
+        sample_inputs=sample_input,
+        sample_targets=sample_target,
+        _sequential=sequential,
+    )
+
+    loss_q = pybuda.run.get_loss_queue()
+
+    for step in range(num_steps):
+        tt0.push_to_inputs(sample_input)
+        cpu0.push_to_target_inputs(sample_target)
+
+        pybuda.run_forward(input_count=1, _sequential=sequential)
+        pybuda.run_backward(input_count=1, zero_grad=True, _sequential=sequential)
+        pybuda.run_optimizer(checkpoint=True, _sequential=sequential)
+
+    step = 0
+    while not loss_q.empty():
+        writer.add_scalar("Loss/PyBuda/overfit", loss_q.get()[0], step)
+        step += 1
+
+if __name__ == "__main__":
+    main()
diff --git a/pybuda/test/training/mnist/mnist_pytorch_overfit.py b/pybuda/test/training/mnist/mnist_pytorch_overfit.py
new file mode 100644
index 00000000..4922942d
--- /dev/null
+++ b/pybuda/test/training/mnist/mnist_pytorch_overfit.py
@@ -0,0 +1,57 @@
+import torch
+from torch import nn
+
+from utils import (
+    MNISTLinear,
+    Identity,
+    load_tb_writer,
+    load_dataset,
+)
+
+
+def main():
+    torch.manual_seed(0)
+
+    # Training configurations
+    num_steps = 200
+    batch_size = 1
+    learning_rate = 0.01
+
+    # Load dataset
+    test_loader, train_loader = load_dataset(batch_size)
+
+    # Load TensorBoard writer
+    writer = load_tb_writer()
+
+    # Dataset sample input
+    sample_input = test_loader.dataset[0][0].repeat(batch_size, 1)
+    sample_target = (
+        nn.functional.one_hot(torch.tensor(test_loader.dataset[0][1]), num_classes=10)
+        .float()
+        .repeat(batch_size, 1)
+    )
+
+    # Initialize model
+    framework_model = MNISTLinear()
+
+    # Initialize optimizer and loss function
+    optimizer = torch.optim.SGD(framework_model.parameters(), lr=learning_rate)
+    loss_fn = torch.nn.L1Loss()
+
+    # Training loop
+    for step in range(num_steps):
+        optimizer.zero_grad()
+
+        outputs = framework_model(sample_input)
+
+        loss = loss_fn(outputs, sample_target)
+        loss.backward()
+
+        optimizer.step()
+
+        # Log loss
+        writer.add_scalar("Loss/PyTorch/overfit", loss.item(), step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pybuda/test/training/mnist/utils.py b/pybuda/test/training/mnist/utils.py
new file mode 100644
index 00000000..58f0c5fb
--- /dev/null
+++ b/pybuda/test/training/mnist/utils.py
@@ -0,0 +1,62 @@
+from datetime import datetime
+
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+import torchvision.transforms as transforms
+from torch.utils.tensorboard import SummaryWriter
+from torchvision.datasets import MNIST as mnist_dataset
+
+# Model definition
+class MNISTLinear(nn.Module):
+    def __init__(self, input_size=784, output_size=10, hidden_size=256):
+        super(MNISTLinear, self).__init__()
+        self.l1 = nn.Linear(input_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.l2 = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x):
+        x = self.l1(x)
+        x = self.relu(x)
+        x = self.l2(x)
+
+        return nn.functional.softmax(x)
+
+# Identity function. Simply propagate activations.
Used for attaching loss function onto CPU device +class Identity(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, act): + return act + 0 + +def load_tb_writer(): + """ + Load TensorBoard writer for logging + """ + current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + log_dir = f"runs/gradient_visualization/{current_time}/" + writer = SummaryWriter(log_dir) + + return writer + + +def load_dataset(batch_size): + """ + Load and normalize MNIST dataset + """ + transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), # Mean and std for MNIST + transforms.Lambda(lambda x: x.view(-1)), # Flatten image + ] + ) + + train_dataset = mnist_dataset(root="./data", train=True, download=True, transform=transform) + test_dataset = mnist_dataset(root="./data", train=False, download=True, transform=transform) + + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False) + + return test_loader, train_loader From 357f3b47aec5d83fe4f9eabdfda343e26e896e30 Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 11 Jul 2024 09:58:15 +0000 Subject: [PATCH 026/116] Add yaml configurations for yolox-n,t,s,m(e300,e150) demo script -pytorch (cherry picked from commit 5ff954d4f5d6ed15aef5899f947634cdc0b18eae) --- pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py index 8c615809..d29249f9 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py @@ -226,7 +226,7 @@ def test_yolox_pytorch(variant, test_device): elif test_device.arch == BackendDevice.Grayskull: if variant in ["yolox_nano", "yolox_s", "yolox_l", "yolox_x"]: pcc_value = 0.93 - elif variant in ["yolox_m,yolox_darknet"]: + elif variant in ["yolox_m","yolox_darknet"]: pcc_value = 0.92 elif variant == "yolox_tiny": pcc_value = 0.98 From da467fe760f710739425eeb39cdda93b89c09dfa Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat Date: Fri, 12 Jul 2024 11:54:10 +0000 Subject: [PATCH 027/116] MNIST Training: Support for loss on TT device (cherry picked from commit 12e342273230b6a76d98b37247de5c2baa0ef43c) --- .../training/mnist/mnist_pybuda_overfit.py | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/pybuda/test/training/mnist/mnist_pybuda_overfit.py b/pybuda/test/training/mnist/mnist_pybuda_overfit.py index 9ae296d8..fe6a04c9 100644 --- a/pybuda/test/training/mnist/mnist_pybuda_overfit.py +++ b/pybuda/test/training/mnist/mnist_pybuda_overfit.py @@ -1,4 +1,5 @@ import torch +import pytest from torch import nn import pybuda @@ -6,18 +7,20 @@ CPUDevice, PyTorchModule, ) -from utils import ( +from .utils import ( MNISTLinear, Identity, load_tb_writer, load_dataset, ) +from pybuda.config import _get_global_compiler_config -def main(): + +def main(loss_on_cpu=True): torch.manual_seed(0) # Config - num_steps = 200 + num_steps = 250 batch_size = 1 learning_rate = 0.01 sequential = True @@ -39,14 +42,24 @@ def main(): # Initialize model framework_model = MNISTLinear() tt_model = pybuda.PyTorchModule("mnist_linear", framework_model) - + tt_optimizer = pybuda.optimizers.SGD( learning_rate=learning_rate, device_params=True ) tt0 = pybuda.TTDevice("tt0", module=tt_model, 
optimizer=tt_optimizer) - cpu0 = CPUDevice("cpu0", module=PyTorchModule("identity", Identity())) - cpu0.place_loss_module(pybuda.PyTorchModule("l1_loss", torch.nn.L1Loss())) + if loss_on_cpu: + cpu0 = CPUDevice("cpu0", module=PyTorchModule("identity", Identity())) + cpu0.place_loss_module(pybuda.PyTorchModule("l1_loss", torch.nn.MSELoss())) + else: + tt_loss = pybuda.PyTorchModule("l1_loss", torch.nn.MSELoss()) + tt0.place_loss_module(tt_loss) + + compiler_cfg = _get_global_compiler_config() + compiler_cfg.enable_auto_fusing = False + + if not loss_on_cpu: + sample_target = (sample_target,) checkpoint_queue = pybuda.initialize_pipeline( training=True, @@ -55,20 +68,33 @@ def main(): _sequential=sequential, ) - loss_q = pybuda.run.get_loss_queue() - for step in range(num_steps): tt0.push_to_inputs(sample_input) - cpu0.push_to_target_inputs(sample_target) + if loss_on_cpu: + cpu0.push_to_target_inputs(sample_target) + else: + tt0.push_to_target_inputs(sample_target) pybuda.run_forward(input_count=1, _sequential=sequential) pybuda.run_backward(input_count=1, zero_grad=True, _sequential=sequential) pybuda.run_optimizer(checkpoint=True, _sequential=sequential) + loss_q = pybuda.run.get_loss_queue() + step = 0 while not loss_q.empty(): - writer.add_scalar("Loss/PyBuda/overfit", loss_q.get()[0], step) + if loss_on_cpu: + writer.add_scalar("Loss/PyBuda/overfit", loss_q.get()[0], step) + else: + writer.add_scalar("Loss/PyBuda/overfit", loss_q.get()[0].value()[0], step) step += 1 - + + +loss_on_cpu = [True, False] +@pytest.mark.parametrize("loss_on_cpu", loss_on_cpu, ids=loss_on_cpu) +def test_mnist_pybuda_overfit(loss_on_cpu): + main(loss_on_cpu) + + if __name__ == "__main__": main() From 280c15fed5ae16ef3bdd6a6065d9415ea21d38aa Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 3 Jul 2024 12:06:18 +0000 Subject: [PATCH 028/116] Use property for datatypes Issue #2755 (cherry picked from commit 1a7abd1ba81e63b21b2848acef6ff3b43bdc457d) --- pybuda/pybuda/op_repo/datatypes.py | 2 ++ pybuda/test/random/rgg/algorithms.py | 17 +++++++++-------- pybuda/test/random/rgg/datatypes.py | 11 ++++++++--- .../random/rgg/pybuda/generated_model.jinja2 | 10 +++++----- .../random/rgg/pytorch/generated_model.jinja2 | 8 ++++---- pybuda/test/random/rgg/utils.py | 2 +- 6 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pybuda/pybuda/op_repo/datatypes.py b/pybuda/pybuda/op_repo/datatypes.py index 9f7fd184..4822924e 100644 --- a/pybuda/pybuda/op_repo/datatypes.py +++ b/pybuda/pybuda/op_repo/datatypes.py @@ -36,9 +36,11 @@ class OperatorDefinition: operands: List[str] = field(default_factory=list) # TODO describe operand and shapes calc_input_shapes: Optional[Callable[["OperatorDefinition", TensorShape, Random], List[TensorShape]]] = None # calculate input shapes from output shape + @property def is_operator(self) -> bool: return not self.instantiate + @property def is_layer(self) -> bool: return self.instantiate diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 52354f6a..931d78b5 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -67,7 +67,7 @@ def init_nodes(cls, test_context: RandomizerTestContext): for input_node in node.inputs: if (input_node is not None and not NodeUtils.is_previous_node(node, input_node)) or cls.always_unique_variables: # overriding default output variable name - input_node.out_value = input_node.operator_name() + input_node.out_value = input_node.operator_name logger.trace(f"Set out_value = 
{input_node.out_value}") logger.trace("Setting input nodes for open nodes") @@ -81,6 +81,7 @@ def init_nodes(cls, test_context: RandomizerTestContext): used_input_nodes: List[RandomizerInputNode] = [] for open_input_index in NodeUtils.get_open_input_indices(node): input_shape = input_shapes[open_input_index] + # list of all graph input nodes with the same shape as the input shape input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shape] # list of input nodes with the same shape that are not already connected to the node @@ -97,12 +98,12 @@ def init_nodes(cls, test_context: RandomizerTestContext): if allow_repeat: if not same_inputs_rate_limitter.is_allowed(): - logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.get_name()}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") allow_repeat = False if allow_repeat: input_node = rng_shape.choice(input_nodes_with_same_shape) - logger.trace(f"Allowed same input value {input_node.out_value} -> {node.get_name()}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") else: # create a new input node with the same shape since there are no unused input nodes with the same shape or repeat is not allowed @@ -140,12 +141,12 @@ def validate_graph(cls, graph: RandomizerGraph): for node in nodes: if node.operator.input_num and node.operator.input_num > 1: if NodeUtils.num_of_open_inputs(node) > 0: - raise Exception(f"Closed {NodeUtils.num_of_closed_inputs(node)}/{node.operator.input_num} inputs, missing {NodeUtils.num_of_open_inputs(node)} inputs for node {node.node_info()}") + raise Exception(f"Closed {NodeUtils.num_of_closed_inputs(node)}/{node.operator.input_num} inputs, missing {NodeUtils.num_of_open_inputs(node)} inputs for node {node.node_info}") # Validation of operator and layer types for node in nodes: if node.operator and not isinstance(node.operator, OperatorDefinition): - raise Exception(f"Step operator is wrong type {node.node_info()} expected RandomizerOperator got {type(node.operator)}") + raise Exception(f"Step operator is wrong type {node.node_info} expected RandomizerOperator got {type(node.operator)}") @classmethod def prepare_graph(cls, test_context: RandomizerTestContext): @@ -266,7 +267,7 @@ def build_graph(self, test_context: RandomizerTestContext): if len(random_nodes) > 1: for random_node in random_nodes[1:]: - logger.trace(f"Constructing new fork join from operator op{node_index} {op1.name} -> {random_node.get_name()}") + logger.trace(f"Constructing new fork join from operator op{node_index} {op1.name} -> {random_node.name}") else: random_nodes = [] @@ -291,10 +292,10 @@ def build_graph(self, test_context: RandomizerTestContext): # Limit number of same inputs on same node if node_connected: if not same_inputs_rate_limitter.is_allowed(): - logger.trace(f"Skipping same input node connection op{node_index} {node.get_name()} -> {closing_node.get_name()}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Skipping same input node connection op{node_index} {node.name} -> 
{closing_node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") continue else: - logger.trace(f"Allowed same input node connection op{node_index} {node.get_name()} -> {closing_node.get_name()}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Allowed same input node connection op{node_index} {node.name} -> {closing_node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") closing_node.inputs[open_input_index] = node node_connected = True diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index a3fb25db..85d150d7 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -38,20 +38,25 @@ def __post_init__(self): # Inputs will be set later during graph construction self.inputs = [None for _ in range(self.operator.input_num)] + @property def operator_name(self): return f"op{self.index}" + @property def layer_name(self): return f"l{self.index}" + @property def node_name(self): - return self.operator_name() if self.operator.is_operator() else self.layer_name() + return self.operator_name if self.operator.is_operator else self.layer_name - def get_name(self): + @property + def name(self): return self.operator.name + @property def node_info(self): - return f"{self.node_name()} {self.get_name()}" + return f"{self.node_name} {self.name}" @dataclass diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 index bb8be3d1..55eaf50d 100644 --- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 @@ -17,8 +17,8 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule): def __init__(self, module_name: str = "Buda Test GeneratedTestModel_{{ test_id }}"): super(GeneratedTestModel_{{ test_index }}_{{ random_seed }}, self).__init__(module_name) self.testname = "Operator Test GeneratedTestModel_{{ test_id }}" -{% for node in graph.nodes %}{% if node.operator.is_layer() %} - self.{{ node.layer_name() }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %} +{% for node in graph.nodes %}{% if node.operator.is_layer %} + self.{{ node.layer_name }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %} def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: pybuda.Tensor{% endfor %} @@ -27,9 +27,9 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule): # shapes: {{ node.input_shapes }} -> {{ node.output_shape }} inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %} - print(f"{{ node.layer_name() }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer() %} - {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %} - {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name() }}', {{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %} + print(f"{{ node.layer_name }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer %} + {{ node.out_value }} = self.{{ node.layer_name }}(inputs[0]){% else %} + 
{{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name }}', {{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
         assert {{ node.out_value }}.shape.dims == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
 
         return v
diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
index 0e616a6a..11e56b05 100644
--- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2
+++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2
@@ -15,8 +15,8 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
 
     def __init__(self):
         super(GeneratedTestModel_{{ test_index }}_{{ random_seed }}, self).__init__()
-{% for node in graph.nodes %}{% if node.operator.is_layer() %}
-        self.{{ node.layer_name() }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %}
+{% for node in graph.nodes %}{% if node.operator.is_layer %}
+        self.{{ node.layer_name }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %}
 
     def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: torch.Tensor{% endfor %}
@@ -25,8 +25,8 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module):
 
             # shapes: {{ node.input_shapes }} -> {{ node.output_shape }}
             inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %}
-            print(f"{{ node.layer_name() }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer() %}
-            {{ node.out_value }} = self.{{ node.layer_name() }}(inputs[0]){% else %}
+            print(f"{{ node.layer_name }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer %}
+            {{ node.out_value }} = self.{{ node.layer_name }}(inputs[0]){% else %}
             {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}({{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %}
             assert {{ node.out_value }}.shape == {{ reduce_microbatch_size(node.output_shape) }}, f"Unexpected output shape of {{ node.out_value }} { {{ node.out_value }}.shape } <> {{ reduce_microbatch_size(node.output_shape) }}"{% endif %}{% endfor %}
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index 316d9b94..860a8c60 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -148,7 +148,7 @@ def get_input_shapes(cls, graph: RandomizerGraph) -> List[TensorShape]:
 
     @classmethod
     def to_ops_str(cls, graph: RandomizerGraph) -> str:
-        ops = [node.get_name() for node in graph.nodes]
+        ops = [node.name for node in graph.nodes]
         ops_str = " -> ".join(ops)
         return ops_str
 
From aa033248829cd06392604d8bc00ef84f239c9447 Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Thu, 4 Jul 2024 15:46:29 +0000
Subject: [PATCH 029/116] Fix randomize size

Avoid convergence of random size to 1

Issue #2755

(cherry picked from commit 31d4254501278200b83591570e6bdb4e44a26c96)
---
 pybuda/pybuda/op_repo/shapes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pybuda/pybuda/op_repo/shapes.py b/pybuda/pybuda/op_repo/shapes.py
index ebe42814..775d1dc6 100644
--- a/pybuda/pybuda/op_repo/shapes.py
+++ b/pybuda/pybuda/op_repo/shapes.py
@@ -56,4 +56,8 @@ def randomize_size(n: int, rng_shape: Random) -> int:
     Returns:
         int: random size of a dimension
     '''
-    return n + (rng_shape.randint(0, 1) * 2 - 1) * rng_shape.randint(0, n // 2)
+    range = n // 2
+    diff = rng_shape.randint(-1 * range, max(range, 1))
+    new_value = n + diff
+    # logger.trace(f"Randomize size: {n} + {diff} -> {new_value}")
+    return new_value

From 0621b057cf204b20bbce546a14b5c41aba274e17 Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Wed, 3 Jul 2024 12:19:38 +0000
Subject: [PATCH 030/116] Constant input for RGG graphs

Adding constant input for RGG graphs according to CONSTANT_INPUT_RATE

Constants are created in two phases, graph building and connecting
external inputs, to distribute them equally.

Issue #2755

(cherry picked from commit 906fb9d9b9aed9737d16cfb7be7e1212c2268422)
---
 pybuda/test/README.debug.md | 1 +
 pybuda/test/random/rgg/__init__.py | 2 +
 pybuda/test/random/rgg/algorithms.py | 98 +++++++++++++------
 pybuda/test/random/rgg/base.py | 2 +-
 pybuda/test/random/rgg/config.py | 1 +
 pybuda/test/random/rgg/datatypes.py | 14 ++-
 .../random/rgg/pybuda/generated_model.jinja2 | 5 +-
 .../random/rgg/pytorch/generated_model.jinja2 | 4 +-
 8 files changed, 94 insertions(+), 33 deletions(-)

diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md
index 6753e5aa..c2d735b6 100644
--- a/pybuda/test/README.debug.md
+++ b/pybuda/test/README.debug.md
@@ -13,4 +13,5 @@
   * NUM\_OF\_NODES\_MIN: Minimal number of nodes to be generated by RGG. (default: 5)
   * NUM\_OF\_NODES\_MAX: Maximum number of nodes to be generated by RGG. (default: 10)
   * NUM\_OF\_FORK\_JOINS\_MAX: Maximum number of fork joins to be generated by random graph algorithm in RGG. (default: 50)
+  * CONSTANT\_INPUT\_RATE: Rate of constant inputs in RGG in percent. (default: 20)
   * SAME\_INPUTS\_PERCENT\_LIMIT: Percent limit of nodes which have same value on multiple inputs.
(default: 10) diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py index 7a555765..8817247d 100644 --- a/pybuda/test/random/rgg/__init__.py +++ b/pybuda/test/random/rgg/__init__.py @@ -4,6 +4,7 @@ from .datatypes import TensorShape +from .datatypes import RandomizerConstantNode from .datatypes import RandomizerInputNode, RandomizerNode, ExecutionContext, RandomizerParameters, RandomizerGraph, RandomizerConfig from .datatypes import RandomizerTestContext from .config import get_randomizer_config_default @@ -17,6 +18,7 @@ __all__ = [ "TensorShape", + "RandomizerConstantNode", "RandomizerInputNode", "RandomizerNode", "ExecutionContext", diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 931d78b5..46d770f6 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -11,6 +11,7 @@ from .datatypes import RandomizerGraph, RandomizerTestContext from .datatypes import RandomizerInputNode +from .datatypes import RandomizerConstantNode from .base import RandomizerNode, GraphBuilder from .base import Framework from .utils import RandomUtils, StrUtils, NodeUtils @@ -51,6 +52,7 @@ def init_nodes(cls, test_context: RandomizerTestContext): rng_shape = test_context.rng_shape rng_params = test_context.rng_params + constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) # Setting node.index @@ -65,7 +67,7 @@ def init_nodes(cls, test_context: RandomizerTestContext): # setting default output variable name node.out_value = "v" for input_node in node.inputs: - if (input_node is not None and not NodeUtils.is_previous_node(node, input_node)) or cls.always_unique_variables: + if input_node is not None and not input_node.constant and (not NodeUtils.is_previous_node(node, input_node) or cls.always_unique_variables): # overriding default output variable name input_node.out_value = input_node.operator_name logger.trace(f"Set out_value = {input_node.out_value}") @@ -82,39 +84,55 @@ def init_nodes(cls, test_context: RandomizerTestContext): for open_input_index in NodeUtils.get_open_input_indices(node): input_shape = input_shapes[open_input_index] - # list of all graph input nodes with the same shape as the input shape - input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shape] - # list of input nodes with the same shape that are not already connected to the node - input_nodes_with_same_shape_unused = [input_node for input_node in input_nodes_with_same_shape if input_node not in used_input_nodes] - if len(input_nodes_with_same_shape_unused) > 0: - # reuse existing input node with the same shape that is not already connected to the node - input_node = input_nodes_with_same_shape_unused[0] - used_input_nodes.append(input_node) + # There must be at least one input node for forward method + if len(graph.input_nodes) > 0 and constant_input_rate_limitter.is_allowed(): + # Creates a new constant node with the same shape + constant_node = RandomizerConstantNode(out_value=None, input_shape=input_shape) + logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {constant_input_rate_limitter.limit_info()}") + # Stores the new constant node in the graph constant nodes + graph.constant_nodes.append(constant_node) + 
input_node = constant_node else: - # there are no input nodes with the same shape that are not already connected to the node - # check if same input value is allowed - # there must be at least one input node with the same shape to allow repeat - allow_repeat = len(input_nodes_with_same_shape) > 0 - - if allow_repeat: - if not same_inputs_rate_limitter.is_allowed(): - logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") - allow_repeat = False - - if allow_repeat: - input_node = rng_shape.choice(input_nodes_with_same_shape) - logger.trace(f"Allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") - - else: - # create a new input node with the same shape since there are no unused input nodes with the same shape or repeat is not allowed - input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}", input_shape=input_shape) + # list of all graph input nodes with the same shape as the input shape + input_nodes_with_same_shape = [input_node for input_node in graph.input_nodes if input_node.input_shape == input_shape] + # list of input nodes with the same shape that are not already connected to the node + input_nodes_with_same_shape_unused = [input_node for input_node in input_nodes_with_same_shape if input_node not in used_input_nodes] + if len(input_nodes_with_same_shape_unused) > 0: + # reuse existing input node with the same shape that is not already connected to the node + input_node = input_nodes_with_same_shape_unused[0] used_input_nodes.append(input_node) - # store the new input node in the graph input nodes - graph.input_nodes.append(input_node) - + else: + # there are no input nodes with the same shape that are not already connected to the node + # check if same input value is allowed + # there must be at least one input node with the same shape to allow repeat + allow_repeat = len(input_nodes_with_same_shape) > 0 + + if allow_repeat: + if not same_inputs_rate_limitter.is_allowed(): + logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + allow_repeat = False + + if allow_repeat: + input_node = rng_shape.choice(input_nodes_with_same_shape) + logger.trace(f"Allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + + else: + # create a new input node with the same shape since there are no unused input nodes with the same shape or repeat is not allowed + input_node = RandomizerInputNode(out_value=f"in_value{len(graph.input_nodes)+1}", input_shape=input_shape) + used_input_nodes.append(input_node) + # store the new input node in the graph input nodes + graph.input_nodes.append(input_node) + # connect the input node to the open node input node.inputs[open_input_index] = input_node + # Assign constant node values after connecting inputs + iconst_index = 0 + for i, constant_node in enumerate(graph.constant_nodes): + if constant_node.out_value is None: + iconst_index += 1 + constant_node.out_value = f"iconst{iconst_index}" + logger.trace("Generating random settings for operator parameters") # Generate random values for operator parameters for node in nodes: @@ -212,6 +230,7 @@ def build_graph(self, test_context: RandomizerTestContext): 
fork_join_counter = 0 fork_join_max = test_context.randomizer_config.num_fork_joins_max + constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max @@ -301,6 +320,27 @@ def build_graph(self, test_context: RandomizerTestContext): nodes.insert(0, node) + # Connecting constants randomly to current node inputs + open_nodes = NodeUtils.get_open_nodes(nodes) + open_nodes_count = len(open_nodes) + input_shapes = node.input_shapes + for open_input_index in NodeUtils.get_open_input_indices(node): + input_shape = input_shapes[open_input_index] + # Skip connecting constant input for last open input to avoid disconnected graph + if open_nodes_count > 1 or NodeUtils.num_of_open_inputs(node) > 1: + if constant_input_rate_limitter.is_allowed(): + # Creates a new constant node with the same shape + constant_node = RandomizerConstantNode(out_value=None, input_shape=input_shape) + logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {constant_input_rate_limitter.limit_info()}") + # Stores the new constant node in the graph constant nodes + graph.constant_nodes.insert(0, constant_node) + # Connects the input node to the open node input + node.inputs[open_input_index] = constant_node + + # Assign constant node values + for i, constant_node in enumerate(graph.constant_nodes): + constant_node.out_value = f"nconst{i+1}" + logger.trace(f"Graph built with {len(nodes)} nodes") logger.trace("Preparing graph") diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py index 8dbe204b..890688dd 100644 --- a/pybuda/test/random/rgg/base.py +++ b/pybuda/test/random/rgg/base.py @@ -265,7 +265,7 @@ def run(self, graph_builder: GraphBuilder): # saving test source code to file for debugging purposes self.save_test(test_code_str, failing_test=False) - logger.debug(f"Graph built in: {graph_duration.get_duration():.4f} seconds") + logger.info(f"Graph built in: {graph_duration.get_duration():.4f} seconds") if randomizer_config.run_test: # instantiate PyBuda model diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py index a50a6ff7..6eec52e8 100644 --- a/pybuda/test/random/rgg/config.py +++ b/pybuda/test/random/rgg/config.py @@ -33,6 +33,7 @@ def get_randomizer_config_default(): num_of_nodes_min=int(os.environ.get("NUM_OF_NODES_MIN", 5)), num_of_nodes_max=int(os.environ.get("NUM_OF_NODES_MAX", 10)), num_fork_joins_max=int(os.environ.get("NUM_OF_FORK_JOINS_MAX", 50)), + constant_input_rate=int(os.environ.get("CONSTANT_INPUT_RATE", 20)), same_inputs_percent_limit=int(os.environ.get("SAME_INPUTS_PERCENT_LIMIT", 10)), ) return randomizer_config diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index 85d150d7..afa2afbb 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -4,7 +4,7 @@ # Generic test model randomizer -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Final, Tuple from dataclasses import dataclass, field import random import torch @@ -16,14 +16,24 @@ # Defining a type for tensor shape TensorShape = Tuple[int, ...] 
+ @dataclass class RandomizerInputNode: + constant: Final[bool] = field(default=False, init=False) + out_value: str + input_shape: TensorShape + + +@dataclass +class RandomizerConstantNode: + constant: Final[bool] = field(default=True, init=False) out_value: str input_shape: TensorShape @dataclass class RandomizerNode: + constant: Final[bool] = field(default=False, init=False) index: Optional[int] = None out_value: Optional[str] = None operator: Optional[OperatorDefinition] = None @@ -82,6 +92,7 @@ class RandomizerGraph: # parameters: RandomizerParameters nodes: List[RandomizerNode] = field(default_factory=list) input_nodes: List[RandomizerInputNode] = field(default_factory=list) + constant_nodes: List[RandomizerConstantNode] = field(default_factory=list) # graph_builder: Optional[str] = None @@ -105,6 +116,7 @@ class RandomizerConfig: num_of_nodes_min: int = 5 num_of_nodes_max: int = 10 num_fork_joins_max: int = 50 + constant_input_rate: int = 20 same_inputs_percent_limit: int = 10 diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 index 55eaf50d..b362765f 100644 --- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 @@ -19,6 +19,9 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule): self.testname = "Operator Test GeneratedTestModel_{{ test_id }}" {% for node in graph.nodes %}{% if node.operator.is_layer %} self.{{ node.layer_name }} = {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %} + {% for constant_node in graph.constant_nodes %} + self.add_constant("{{ constant_node.out_value }}") + self.set_constant("{{ constant_node.out_value }}", torch.randn({{ reduce_microbatch_size(constant_node.input_shape) }})){% endfor %} def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: pybuda.Tensor{% endfor %} @@ -26,7 +29,7 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(PyBudaModule): {% for node in graph.nodes %} # shapes: {{ node.input_shapes }} -> {{ node.output_shape }} - inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %} + inputs = [{% for input_node in node.inputs %}{% if input_node.constant %}self.get_constant("{{ input_node.out_value }}"){% else %}{{ input_node.out_value }}{% endif %}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %} print(f"{{ node.layer_name }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer %} {{ node.out_value }} = self.{{ node.layer_name }}(inputs[0]){% else %} {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}('{{ node.node_name }}', {{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %} diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 index 11e56b05..dcbfabdb 100644 --- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 @@ -17,6 +17,8 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module): super(GeneratedTestModel_{{ test_index }}_{{ random_seed }}, self).__init__() {% for node in graph.nodes %}{% if node.operator.is_layer %} self.{{ node.layer_name }} 
= {{ node.operator.full_name }}({{ constructor_kwargs(node=node) }}){% endif %}{% endfor %} + {% for constant_node in graph.constant_nodes %} + self.{{ constant_node.out_value }} = torch.randn({{ reduce_microbatch_size(constant_node.input_shape) }}){% endfor %} def forward(self{% for node in graph.input_nodes %}, {{ node.out_value }}: torch.Tensor{% endfor %} @@ -24,7 +26,7 @@ class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module): {% for node in graph.nodes %} # shapes: {{ node.input_shapes }} -> {{ node.output_shape }} - inputs = [{% for input_node in node.inputs %}{{ input_node.out_value }}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %} + inputs = [{% for input_node in node.inputs %}{% if input_node.constant %}self.{{ input_node.out_value }}{% else %}{{ input_node.out_value }}{% endif %}{% if not loop.last %}, {% endif %}{% endfor %}]{% if randomizer_config.debug_shapes %} print(f"{{ node.layer_name }} inputs: {DebugUtils.format_tensors(inputs)}"){% endif %}{% if node.operator.is_layer %} {{ node.out_value }} = self.{{ node.layer_name }}(inputs[0]){% else %} {{ node.out_value }} = {% if node.operator.forward_code %}{{node.operator.forward_code()}}{% else %}{{ node.operator.full_name }}({{ forward_args(node=node) }}, {{ forward_kwargs(node=node) }}){% endif %}{% endif %}{% if randomizer_config.verify_shapes %} From bb445e2d4ce1eb0de07555804eecbeb7a05860e7 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Thu, 11 Jul 2024 12:04:21 +0000 Subject: [PATCH 031/116] Operator tests utils Issue #2554 / #2787 (cherry picked from commit 40cd5d66793f7d1bc7ebf6944c7fd374a7b88dbf) --- pybuda/pybuda/op_repo/__init__.py | 3 +- pybuda/test/operators/nary/test_stack.py | 450 +++++++++++------------ pybuda/test/operators/utils/__init__.py | 17 + pybuda/test/operators/utils/utils.py | 110 ++++++ pybuda/test/random/rgg/base.py | 7 +- 5 files changed, 346 insertions(+), 241 deletions(-) create mode 100644 pybuda/test/operators/utils/utils.py diff --git a/pybuda/pybuda/op_repo/__init__.py b/pybuda/pybuda/op_repo/__init__.py index 7d97e709..8a21de90 100644 --- a/pybuda/pybuda/op_repo/__init__.py +++ b/pybuda/pybuda/op_repo/__init__.py @@ -11,11 +11,12 @@ # - TVM python_codegen.py -from .datatypes import OperatorParam, OperatorParamNumber, OperatorDefinition, OperatorRepository +from .datatypes import TensorShape, OperatorParam, OperatorParamNumber, OperatorDefinition, OperatorRepository from .pybuda_operators import pybuda_operator_repository from .pytorch_operators import pytorch_operator_repository __ALL__ = [ + "TensorShape", "OperatorParam", "OperatorParamNumber", "OperatorDefinition", diff --git a/pybuda/test/operators/nary/test_stack.py b/pybuda/test/operators/nary/test_stack.py index 532a502a..3540b6c9 100644 --- a/pybuda/test/operators/nary/test_stack.py +++ b/pybuda/test/operators/nary/test_stack.py @@ -61,10 +61,33 @@ import pybuda.tensor import torch -from pybuda import PyBudaModule, VerifyConfig -from pybuda.config import _get_global_compiler_config -from pybuda.verify import TestKind, verify_module -from test.operators.utils import netlist_utils +from typing import List, Dict +from loguru import logger + +from pybuda import PyBudaModule +from pybuda.op_repo import TensorShape +from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import ShapeUtils +from test.conftest import TestDevice + + +def verify(model: PyBudaModule, test_device: TestDevice, input_shape: 
TensorShape, number_of_operands: int, input_params: List[Dict] = [], input_source_flag: InputSourceFlags = None, dev_data_format: pybuda.DataFormat = None, math_fidelity: pybuda.MathFidelity = None):
+    '''Common verification function for all tests'''
+
+    input_shapes = tuple([input_shape for _ in range(number_of_operands)])
+    logger.trace(f"***input_shapes: {input_shapes}")
+
+    if input_source_flag:
+        CompilerUtils.set_input_source(input_source_flag.value)
+
+    if math_fidelity:
+        CompilerUtils.set_math_fidelity(math_fidelity)
+
+    if dev_data_format:
+        input_params.append({"dev_data_format": dev_data_format})
+
+    VerifyUtils.verify(model, test_device, input_shapes, input_params)
+
 
 # Currently, verify_module for the Stack operator and Stack operator by itself
 # works only in case of axis = 1. This test demonstrates this case.
@@ -94,17 +117,12 @@
             def forward(self, x, y):
                 return result
 
     mod = Model("test_stack_invalid_axis_model")
-    input_shapes = tuple([input_shape for _ in range(2)])
-    print(f"***input_shapes: {input_shapes}")
-    verify_module(
-        mod,
-        input_shapes=input_shapes,
-        verify_cfg=VerifyConfig(
-            test_kind=TestKind.INFERENCE,
-            devtype=test_device.devtype,
-            arch=test_device.arch,
-        ),
+
+    verify(
+        model=mod,
+        test_device=test_device,
+        input_shape=input_shape,
+        number_of_operands=2,
     )
@@ -167,16 +185,12 @@
         def forward(self, x, y):
             return pybuda.op.Stack("Stack0", x, y, axis=axis)
 
     mod = Model("test_stack_invalid_shape_model")
-    input_shapes = tuple([input_shape for _ in range(2)])
-    verify_module(
-        mod,
-        input_shapes=input_shapes,
-        verify_cfg=VerifyConfig(
-            test_kind=TestKind.INFERENCE,
-            devtype=test_device.devtype,
-            arch=test_device.arch,
-        ),
+
+    verify(
+        model=mod,
+        test_device=test_device,
+        input_shape=input_shape,
+        number_of_operands=2,
     )
@@ -202,7 +216,7 @@ def get_input_shapes(microbatch_size=1):
 
 # 2.1 From another op
 @pytest.mark.parametrize("axis", axises)
 @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
-def test_stack_inputs_from_another_operand(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+def test_stack_inputs_from_another_operand(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None):
 
     class Model(PyBudaModule):
         def __init__(self, name):
@@ -216,21 +230,14 @@
         def forward(self, x, y):
             return output
 
     mod = Model("test_stack_inputs_from_another_operand_model")
-    input_shapes = tuple([input_shape for _ in range(2)])
-
-    if(math_fidelity is not None):
-        compiler_cfg = _get_global_compiler_config()
-        compiler_cfg.default_math_fidelity = math_fidelity
-
-    verify_module(
-        mod,
-        input_shapes=input_shapes,
-        verify_cfg=VerifyConfig(
-            test_kind=TestKind.INFERENCE,
-            devtype=test_device.devtype,
-            arch=test_device.arch,
-        ),
-        input_params=[input_params],
+
+    verify(
+        model=mod,
+        test_device=test_device,
+        input_shape=input_shape,
+        number_of_operands=2,
+        dev_data_format=dev_data_format,
+        math_fidelity=math_fidelity,
     )
@@ -238,7 +245,7 @@
 
 # - Combination: operator -> tm -> input
 @pytest.mark.parametrize("axis", axises)
 @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1))
-def test_stack_inputs_from_tm_edge1(test_device, axis, input_shape, input_params=[], math_fidelity=None):
+def test_stack_inputs_from_tm_edge1(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None):
 
     class Model(PyBudaModule):
         def __init__(self, name):
@@ -251,21 +258,14 @@
         def forward(self, x, y):
             return v3
 
     mod = Model("test_stack_inputs_from_tm_edge1_model")
- input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -273,7 +273,7 @@ def forward(self, x, y): # - tm -> input @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_stack_inputs_from_tm_edge2(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_stack_inputs_from_tm_edge2(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -286,21 +286,14 @@ def forward(self, x, y): return v3 mod = Model("test_stack_inputs_from_tm_edge2_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -308,7 +301,7 @@ def forward(self, x, y): # - input_queue flag = false @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_stack_inputs_from_dram_queue(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_stack_inputs_from_dram_queue(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -318,46 +311,41 @@ def forward(self, x, y): return pybuda.op.Stack("Stack0", x, y, axis=axis) mod = Model("test_stack_inputs_from_dram_queue_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = False - if(math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=2, + input_source_flag=InputSourceFlags.FROM_DRAM, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + + file_path = VerifyUtils.get_netlist_filename() assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' def get_input_shapes_prologued(): # Here we cover interesting combinations of input shapes: - return [((2, 3, 3), True, False), #0 # 3.1 Full tensor (i.e. full expected shape) - ((2, 3, 3), False, True), #1 # 3.1 Full tensor (i.e. 
full expected shape) - ((2, 3, 3), None, True), #2 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 3), True, False), #3 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 3), False, True), #4 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 3), None, True), #5 !!! # 3.1 Full tensor (i.e. full expected shape) - not according to documentation! - ((2, 10, 5), None, True), #6 # 3.1 Full tensor (i.e. full expected shape) - ((2, 1, 15), None, True), #7 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 50, 1), None, True), #8 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 100, 100), None, True), #9 # 4.3 Very large (thousands, 10s of thousands) - ((2, 100, 1000), None, False), #10 # 4.3 Very large (thousands, 10s of thousands) - ((2, 1, 10000), None, False), #11 # 4.4 Extreme ratios between height/width - ((2, 10000, 1), None, False), #12 # 4.4 Extreme ratios between height/width - ((2, 32, 32), None, True), #13 # 4.1 Divisible by 32 - ((2, 96, 96), None, True), #14 # 4.1 Divisible by 32 - ((2, 13, 97), None, True), #15 # 4.2 Prime numbers + # Columns: input_shape, input_source_flag, should_prolog + return [((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #0 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #1 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #2 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #3 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #4 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #5 !!! # 3.1 Full tensor (i.e. full expected shape) - not according to documentation! + ((2, 10, 5), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #6 # 3.1 Full tensor (i.e.
full expected shape) + ((2, 1, 15), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #7 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 50, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #8 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #9 # 4.3 Very large (thousands, 10s of thousands) + ((2, 100, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False), #10 # 4.3 Very large (thousands, 10s of thousands) + ((2, 1, 10000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False), #11 # 4.4 Extreme ratios between height/width + ((2, 10000, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False), #12 # 4.4 Extreme ratios between height/width + ((2, 32, 32), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #13 # 4.1 Divisible by 32 + ((2, 96, 96), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #14 # 4.1 Divisible by 32 + ((2, 13, 97), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #15 # 4.2 Prime numbers ] @@ -365,8 +353,8 @@ def get_input_shapes_prologued(): # - Constants must be small enough to fit into L1 # - Inputs are not prologued for microbatch size = 1 @pytest.mark.parametrize("axis", axises) -@pytest.mark.parametrize("input_shape, default_dram_params, should_prolog", get_input_shapes_prologued()) -def test_stack_inputs_from_dram_prologued(test_device, axis, input_shape, default_dram_params, should_prolog, input_params=[], math_fidelity=None): +@pytest.mark.parametrize("input_shape, input_source_flag, should_prolog", get_input_shapes_prologued()) +def test_stack_inputs_from_dram_prologued(test_device, axis, input_shape, input_source_flag, should_prolog, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -375,8 +363,7 @@ def __init__(self, name): def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - t = input_shape[1:] - self.shape_input = (1, *t) + self.shape_input = ShapeUtils.reduce_microbatch_size(input_shape) self.add_constant("c") self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) @@ -387,23 +374,17 @@ def forward(self, x): mod = Model("test_stack_inputs_from_dram_prologued_model") - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_dram_parameters = default_dram_params - compiler_cfg.input_queues_on_host = False - if(math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=[input_shape], - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=1, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + + file_path = VerifyUtils.get_netlist_filename() d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Stack0") if should_prolog: assert d['prologue'] @@ -414,7 +395,7 @@ def forward(self, x): # 2.5 Const Inputs (const eval pass) @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_stack_inputs_from_constants(test_device, axis,
input_shape, input_params=[], math_fidelity=None): +def test_stack_inputs_from_constants(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -423,7 +404,7 @@ def __init__(self, name): def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - self.shape_input = input_shape + self.shape_input = ShapeUtils.reduce_microbatch_size(input_shape) self.add_constant("c1") self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) @@ -443,24 +424,18 @@ def forward(self, x, y): return v3 mod = Model("test_stack_inputs_from_constants_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + # Here we check there is no key with "Stack" in the netlist in graphs section - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + file_path = VerifyUtils.get_netlist_filename() d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Stack" not in key @@ -469,7 +444,7 @@ def forward(self, x, y): # 2.6 From host - case of two tensors as input @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_stack_inputs_from_host_2(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_stack_inputs_from_host_2(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -479,21 +454,14 @@ def forward(self, x, y): return pybuda.op.Stack("Stack0", x, y, axis=axis) mod = Model("test_stack_inputs_from_host_2_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -501,7 +469,7 @@ def forward(self, x, y): @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) @pytest.mark.parametrize("number_of_operands", [3, 7, 15]) -def test_stack_inputs_from_host_multiple_operands(test_device, axis, input_shape, number_of_operands, input_params=[], math_fidelity=None): +def test_stack_inputs_from_host_multiple_operands(test_device, axis, input_shape, number_of_operands, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -511,21 +479,14 @@ def forward(self, *x): return pybuda.op.Stack("Stack0", *x, axis=axis) mod = Model("test_stack_inputs_from_host_multiple_operands") - input_shapes = 
tuple([input_shape for _ in range(number_of_operands)]) - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + model=mod, + test_device=test_device, + input_shape=input_shape, + number_of_operands=number_of_operands, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -544,9 +505,9 @@ def get_single_shape(microbatch_size=1): ### 1. #################################################################################### # 5.4 Operand DFs -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Float16_b}, - ] +dev_data_formats = [ + pybuda.DataFormat.Float16_b, +] # 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 compiler_math_fidelity = [ @@ -560,106 +521,125 @@ def get_single_shape(microbatch_size=1): # Unfortunately, we can't call all test functions in just one test, because # reset of the compiler configuration and device state is not possible. +@pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_stack_mf_inputs_from_another_operand(test_device, math_fidelity): - test_stack_inputs_from_another_operand(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +def test_stack_mf_inputs_from_another_operand(test_device, dev_data_format, math_fidelity): + test_stack_inputs_from_another_operand(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_tm_edge1(test_device, math_fidelity): -# test_stack_inputs_from_tm_edge1(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# def test_stack_mf_from_tm_edge1(test_device, dev_data_format, math_fidelity): -# test_stack_inputs_from_tm_edge1(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# test_stack_inputs_from_tm_edge1(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_tm_edge2(test_device, math_fidelity): -# test_stack_inputs_from_tm_edge2(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# def test_stack_mf_from_tm_edge2(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_tm_edge2(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_dram_queue(test_device, math_fidelity): -# test_stack_inputs_from_dram_queue(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# def test_stack_mf_from_dram_queue(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_dram_queue(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_dram_prologued(test_device, math_fidelity): -# test_stack_inputs_from_dram_prologued(test_device, axis, get_single_shape(microbatch_size=2), verify_input_params, math_fidelity) +# def
test_stack_mf_from_dram_prologued(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_dram_prologued(test_device, axis, get_single_shape(microbatch_size=2), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_constants(test_device, math_fidelity): -# test_stack_inputs_from_constants(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# def test_stack_mf_from_constants(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_constants(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_host_2(test_device, math_fidelity): -# test_stack_inputs_from_host_2(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +# def test_stack_mf_from_host_2(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_host_2(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_stack_mf_from_host_multiple_operands(test_device, math_fidelity): -# test_stack_inputs_from_host_multiple_operands(test_device, axis, get_single_shape(), 3, verify_input_params, math_fidelity) +# def test_stack_mf_from_host_multiple_operands(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_host_multiple_operands(test_device, axis, get_single_shape(), 3, dev_data_format, math_fidelity) ### 2. #################################################################################### # 5.4 Operand DFs -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Bfp2}, - {"dev_data_format": pybuda.DataFormat.Bfp2_b}, - {"dev_data_format": pybuda.DataFormat.Bfp4}, - {"dev_data_format": pybuda.DataFormat.Bfp4_b}, - {"dev_data_format": pybuda.DataFormat.Bfp8}, - {"dev_data_format": pybuda.DataFormat.Bfp8_b}, - {"dev_data_format": pybuda.DataFormat.Float16}, - {"dev_data_format": pybuda.DataFormat.Float16_b}, - {"dev_data_format": pybuda.DataFormat.Float32}, - {"dev_data_format": pybuda.DataFormat.Int8}, - {"dev_data_format": pybuda.DataFormat.Lf8}, - {"dev_data_format": pybuda.DataFormat.RawUInt16}, - {"dev_data_format": pybuda.DataFormat.RawUInt32}, - {"dev_data_format": pybuda.DataFormat.RawUInt8}, - {"dev_data_format": pybuda.DataFormat.UInt16}, - ] +dev_data_formats=[ + pybuda.DataFormat.Bfp2, + pybuda.DataFormat.Bfp2_b, + pybuda.DataFormat.Bfp4, + pybuda.DataFormat.Bfp4_b, + pybuda.DataFormat.Bfp8, + pybuda.DataFormat.Bfp8_b, + pybuda.DataFormat.Float16, + pybuda.DataFormat.Float16_b, + pybuda.DataFormat.Float32, + pybuda.DataFormat.Int8, + pybuda.DataFormat.Lf8, + pybuda.DataFormat.RawUInt16, + pybuda.DataFormat.RawUInt32, + pybuda.DataFormat.RawUInt8, + pybuda.DataFormat.UInt16, +] # 6. 
Math fidelity -compiler_math_fidelity = pybuda.MathFidelity.HiFi4 +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] -@pytest.mark.parametrize("input_params", verify_input_params) -def test_stack_df_inputs_from_another_operand(test_device, input_params): - test_stack_inputs_from_another_operand(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_stack_df_inputs_from_another_operand(test_device, dev_data_format, math_fidelity): + test_stack_inputs_from_another_operand(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_tm_edge1(test_device, input_params): -# test_stack_inputs_from_tm_edge1(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_tm_edge1(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_tm_edge1(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_tm_edge2(test_device, input_params): -# test_stack_inputs_from_tm_edge2(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_tm_edge2(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_tm_edge2(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_dram_queue(test_device, input_params): -# test_stack_inputs_from_dram_queue(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_dram_queue(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_dram_queue(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_dram_prologued(test_device, input_params): -# test_stack_inputs_from_dram_prologued(test_device, axis, get_single_shape(microbatch_size=2), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_dram_prologued(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_dram_prologued(test_device, axis, get_single_shape(microbatch_size=2), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True, dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_constants(test_device, input_params): -# test_stack_inputs_from_constants(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_constants(test_device, dev_data_format, math_fidelity): +# 
test_stack_inputs_from_constants(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_host_2(test_device, input_params): -# test_stack_inputs_from_host_2(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_host_2(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_host_2(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_stack_df_from_host_multiple_operands(test_device, dev_data_format, math_fidelity): +# test_stack_inputs_from_host_multiple_operands(test_device, axis, get_single_shape(), 3, dev_data_format, math_fidelity) -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_stack_df_from_host_multiple_operands(test_device, input_params): -# test_stack_inputs_from_host_multiple_operands(test_device, axis, get_single_shape(), 3, input_params, compiler_math_fidelity) diff --git a/pybuda/test/operators/utils/__init__.py b/pybuda/test/operators/utils/__init__.py index 2332467e..d9c3b26f 100644 --- a/pybuda/test/operators/utils/__init__.py +++ b/pybuda/test/operators/utils/__init__.py @@ -1,3 +1,20 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 + +from .utils import ShapeUtils +from .utils import InputSourceFlag, InputSourceFlags +from .utils import CompilerUtils +from .utils import VerifyUtils +from .utils import LoggerUtils +from .netlist_utils import read_netlist_value + +__all__ = [ + 'read_netlist_value', + 'ShapeUtils', + 'InputSourceFlag', + 'InputSourceFlags', + 'CompilerUtils', + 'VerifyUtils', + 'LoggerUtils', +] diff --git a/pybuda/test/operators/utils/utils.py b/pybuda/test/operators/utils/utils.py new file mode 100644 index 00000000..9ce1f6d7 --- /dev/null +++ b/pybuda/test/operators/utils/utils.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Operator test utilities + +import sys +import pybuda + +from enum import Enum +from dataclasses import dataclass +from loguru import logger +from typing import Optional, List, Dict + +from pybuda import PyBudaModule, VerifyConfig +from pybuda.op_repo import TensorShape +from pybuda.verify import TestKind, verify_module +from pybuda.config import _get_global_compiler_config +from pybuda._C import MathFidelity +from test.conftest import TestDevice + + +class ShapeUtils: + + @staticmethod + def reduce_microbatch_size(shape: TensorShape) -> TensorShape: + ''' + Reduce microbatch dimension of a shape to 1 + Usually used for calculating shape of a constant tensor + ''' + return (1, ) + shape[1:] + + +@dataclass(frozen=True) +class InputSourceFlag: + '''Dataclass for specifying compiler flags for specific input source''' + input_queues_on_host: bool + set_default_dram_parameters: bool + default_dram_parameters: Optional[bool] + + +class InputSourceFlags(Enum): + '''Enums defining input source flags''' + FROM_HOST = InputSourceFlag(True, False, None) + FROM_DRAM = InputSourceFlag(False, False, None) + FROM_DRAM_PROLOGUED = InputSourceFlag(False, True, False) + FROM_DRAM_NOT_PROLOGUED = InputSourceFlag(False, True, True) + 
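+ # Each flag packs the InputSourceFlag fields in order: (input_queues_on_host, set_default_dram_parameters, default_dram_parameters).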
FROM_DRAM_PROLOGUE_MICROBATCH_SIZE = InputSourceFlag(False, True, None) + + +class CompilerUtils: + '''Utility functions for PyBuda compiler configuration''' + + @staticmethod + def set_input_source(input_source_flag: InputSourceFlag): + '''Set compiler configuration for input source''' + compiler_cfg = _get_global_compiler_config() + compiler_cfg.input_queues_on_host = input_source_flag.input_queues_on_host + if input_source_flag.set_default_dram_parameters: + compiler_cfg.default_dram_parameters = input_source_flag.default_dram_parameters + + @staticmethod + def set_math_fidelity(math_fidelity: MathFidelity): + '''Set compiler configuration for math fidelity''' + compiler_cfg = _get_global_compiler_config() + compiler_cfg.default_math_fidelity = math_fidelity + + +class VerifyUtils: + '''Utility functions for PyBuda verification''' + + @staticmethod + def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[TensorShape], input_params: List[Dict] = []): + '''Perform PyBuda verification on the model + + Args: + model: PyBuda model + test_device: TestDevice + input_shapes: List of input shapes + input_params: List of input parameters + ''' + + verify_module( + model, + input_shapes=input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + input_params=[input_params], + ) + + @staticmethod + def get_netlist_filename() -> str: + '''Get netlist filename of the last compiled model''' + return pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + +class LoggerUtils: + '''Utility functions for logging''' + + @staticmethod + def set_log_level(package_name: str, level: str): + ''' Set log level for package_name and its subpackages + + Args: + package_name (str): package name + level (str): log level + ''' + logger.add(sys.stdout, level=level, filter=lambda record: record["name"].startswith(package_name)) diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py index 890688dd..c1f8a841 100644 --- a/pybuda/test/random/rgg/base.py +++ b/pybuda/test/random/rgg/base.py @@ -14,11 +14,11 @@ from pybuda import PyBudaModule from pybuda.verify import verify_module, VerifyConfig from pybuda.op_repo import OperatorRepository +from test.operators.utils import ShapeUtils from test.conftest import TestDevice from test.utils import Timer from .datatypes import RandomizerNode, RandomizerGraph, RandomizerParameters, RandomizerConfig, ExecutionContext from .datatypes import RandomizerTestContext -from .datatypes import TensorShape from .utils import StrUtils, GraphUtils @@ -79,9 +79,6 @@ def forward_args(self, node: RandomizerNode) -> str: def forward_kwargs(self, node: RandomizerNode) -> str: return StrUtils.kwargs_str(**node.forward_kwargs) - def reduce_microbatch_size(self, shape: TensorShape) -> str: - return (1, ) + shape[1:] - def generate_code(self, test_context: RandomizerTestContext, test_format: bool = True) -> str: # TODO setup random seed in generated test function @@ -99,7 +96,7 @@ def generate_code(self, test_context: RandomizerTestContext, test_format: bool = constructor_kwargs=self.constructor_kwargs, forward_args=self.forward_args, forward_kwargs=self.forward_kwargs, - reduce_microbatch_size=self.reduce_microbatch_size, + reduce_microbatch_size=ShapeUtils.reduce_microbatch_size, ExecutionContext=ExecutionContext, ) From eb8ec5ad1c6f92468ee2bafa796ed9075b116ad0 Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Tue, 9 Jul 2024 13:31:53 +0000 Subject: [PATCH 032/116] Remove 
models nlp pytorch (cherry picked from commit 7a13d41cce77f54c5b5ece093f0298f0cf4ca147) --- .../tvm/nlp/pytorch/tests_A/test_albert.py | 140 ---- .../test/tvm/nlp/pytorch/tests_A/test_bert.py | 171 ----- .../test/tvm/nlp/pytorch/tests_A/test_detr.py | 131 ---- .../tvm/nlp/pytorch/tests_A/test_t5_small.py | 91 --- .../test/tvm/nlp/pytorch/tests_A/test_xlm.py | 234 ------- .../nlp/pytorch/tests_B/test_distilbert.py | 247 ------- .../test/tvm/nlp/pytorch/tests_B/test_wmt.py | 114 ---- .../test/tvm/nlp/pytorch/tests_C/test_opt.py | 77 --- .../tvm/nlp/pytorch/tests_C/test_roberta.py | 156 ----- .../tvm/nlp/pytorch/tests_C/test_trocr.py | 97 --- .../tvm/nlp/pytorch/tests_C/test_unispeech.py | 64 -- .../tvm/nlp/pytorch/tests_C/test_wav2vec2.py | 64 -- .../tvm/nlp/pytorch/tests_D/test_bloom.py | 29 - .../test/tvm/nlp/pytorch/tests_D/test_gpt2.py | 630 ------------------ .../test/tvm/nlp/pytorch/tests_D/test_gptj.py | 29 - .../tvm/nlp/pytorch/tests_D/test_gptneo.py | 92 --- .../tvm/nlp/pytorch/tests_D/test_nbeats.py | 165 ----- .../nlp/pytorch/tests_D/test_squeeze_bert.py | 89 --- .../test/tvm/nlp/pytorch/tests_D/test_xglm.py | 219 ------ .../tvm/nlp/pytorch/tests_E/test_codegen.py | 69 -- .../tvm/nlp/pytorch/tests_E/test_whisper.py | 375 ----------- 21 files changed, 3283 deletions(-) delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_A/test_bert.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_A/test_xlm.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_B/test_distilbert.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_B/test_wmt.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_C/test_opt.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_C/test_roberta.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_D/test_gptneo.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_D/test_nbeats.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_D/test_squeeze_bert.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_D/test_xglm.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py delete mode 100644 pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py deleted file mode 100644 index 410c308e..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py +++ /dev/null @@ -1,140 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import configparser -from distutils.command.config import config -import pybuda -import pytest - -import torch -from transformers import AlbertConfig, AlbertModel - -from pybuda import ( - PyTorchModule, - TTDevice, - CPUDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - -from pybuda.op.eval.common import compare_tensor_to_golden -from test.utils import download_model - - -class EmbWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.embeddings = model.embeddings - - def forward(self, input_ids, 
extended_attention_mask): - return self.embeddings(input_ids), extended_attention_mask - -class EncoderWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.encoder = model.encoder - self.pooler = model.pooler - self.pooler_activation = model.pooler_activation - - def forward(self, embedding_output, extended_attention_mask): - encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - ) - return encoder_outputs - - -@pytest.mark.parametrize("add_pooling_layer", [True, False], ids=["pooling", "no_pooling"]) -@pytest.mark.parametrize("version", ['v1', 'v2'], ) -def test_albert_pipeline(test_device, version, add_pooling_layer): - if add_pooling_layer: - pytest.skip("Pooling not supported in backend, will result in unsupported sparse_matmul") - - config = download_model(AlbertConfig.from_pretrained, f"albert-base-{version}", torchscript=True) - model = AlbertModel(config, add_pooling_layer=add_pooling_layer) - model.eval() - - albert_embeddings = EmbWrapper(model) - albert_encoder = EncoderWrapper(model) - - relative_atol = 0.3 if test_device.is_silicon() else 0.1 - - cpu0 = CPUDevice("cpu0", module=PyTorchModule("albert_embeddings", albert_embeddings)) - tt1 = TTDevice("tt1", devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("albert_encoder", albert_encoder)) - - seq_len = 128 - input_ids = torch.randint(config.vocab_size, (1, seq_len)) - attention_mask = torch.ones((1, seq_len)) - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - extended_attention_mask = extended_attention_mask.to(dtype=model.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - cpu0.push_to_inputs(input_ids, extended_attention_mask) - output_q = pybuda.run_inference(_verify_cfg=VerifyConfig(relative_atol=relative_atol)) - outputs = output_q.get() - - torch_outputs = model(input_ids, attention_mask=attention_mask) - assert compare_tensor_to_golden("albert", torch_outputs[0], outputs[0].value(), is_buda=True, relative_atol=relative_atol) - - -def test_albert_v1(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - _get_global_compiler_config().compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 768, 768) - - model = download_model(AlbertModel.from_pretrained, "albert-base-v1", torchscript=True) - - submodel = model.encoder.albert_layer_groups[0].albert_layers[0].attention - mod = PyTorchModule( - "albert_attention_pytorch", - submodel, - ) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"key.bias"}, - ) - ) - # evaluate_framework_vs_pybuda(submodel, res, hidden_states) - -def test_albert_v2(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - input_shape = (1, 768, 768) - - model = download_model(AlbertModel.from_pretrained, "albert-base-v2", torchscript=True) - - submodel = model.encoder.albert_layer_groups[0].albert_layers[0].attention - mod = PyTorchModule( - "albert_attention_pytorch", - submodel, - ) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"key.bias"}, - ) - ) - # evaluate_framework_vs_pybuda(submodel, res, hidden_states) diff --git 
a/pybuda/test/tvm/nlp/pytorch/tests_A/test_bert.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_bert.py deleted file mode 100644 index 4fd1a2dd..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_bert.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import os -import pytest -from collections import OrderedDict - -import torch -from torch import nn -from test.backend.models.test_bert import get_relaxed_atol_pcc -from pybuda.tensor import to_pt_tensors - -from transformers import BertModel, BertConfig, BertForPreTraining -import pybuda -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.op.eval import compare_tensor_to_golden -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -@pytest.mark.parametrize("size", ["tiny", "base", "large"]) -def test_bert_encoder(test_kind, test_device, size): - if size == "tiny": - model_name = "prajjwal1/bert-tiny" - seq_len = 128 - elif size == "base": - model_name = "bert-base-uncased" - seq_len = 128 - elif size == "large": - model_name = "bert-large-uncased" - seq_len = 384 - - pytest.skip("Full model passes inference and training") - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - config = download_model(BertConfig.from_pretrained, model_name) - input_shape = (1, seq_len, config.hidden_size) - model = download_model(BertModel.from_pretrained, model_name, torchscript=True) - - submodel = model.encoder - mod = PyTorchModule("bert_encoder", submodel) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"layer.0.attention.self.key.bias", "layer.1.attention.self.key.bias"}, - ), - input_params=[{"requires_grad": False}], - ) - # evaluate_framework_vs_pybuda(submodel, ret, hidden_states) - -def test_pt_pretrain_heads(test_device): - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - test_device.devtype = BackendType.NoBackend - config = download_model(BertConfig.from_pretrained, "prajjwal1/bert-tiny", torchscript=True) - bert = BertForPreTraining(config) - submodel = bert.cls - mod = PyTorchModule("ptheads", submodel) - verify_module( - mod, - ((1, 128, 128), (1, 128)), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ) - ) - -def test_bert_pt_fallback(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - input_shape = (1, 128) - model = download_model(BertModel.from_pretrained, "prajjwal1/bert-tiny", add_pooling_layer=False) - - mod = PyTorchModule("bert", model) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.retain_tvm_python_files = True - - pcc = 0.9 if test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - waive_gradient_errors={"layer.0.attention.self.key.bias", "layer.1.attention.self.key.bias"}, - ), - 
input_params=[{"requires_grad": False, "data_format": torch.int}], - ) - - -def test_bert_embeddings_fallback(test_kind, test_device): - pytest.skip("Full model passes inference and training") - class EmbModel(nn.Module): - def __init__(self, emb): - super().__init__() - self.emb = emb - self.linear = nn.Linear(128, 32) - - def forward(self, input): - embs = self.emb(input) - lin = self.linear(embs) - return lin - - - compiler_cfg = _get_global_compiler_config() - input_shape = (1, 32) - - bert = download_model(BertModel.from_pretrained, "prajjwal1/bert-tiny", torchscript=True) - mod = PyTorchModule("bert_embedding", EmbModel(bert.embeddings)) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - input_params=[{"requires_grad": False, "data_format": torch.int}], - ) - -def test_bert_direct_fallback(test_kind, test_device): - pytest.skip("Full model passes inference and training") - - compiler_cfg = _get_global_compiler_config() - - config = download_model(BertConfig.from_pretrained, "prajjwal1/bert-tiny") - config.num_hidden_layers = 2 - model = BertModel(config, add_pooling_layer=False) - - mod = PyTorchModule("bert", model) - tt1 = pybuda.TTDevice("tt1", - devtype=test_device.devtype, arch=test_device.arch, module=mod) - input_shape = (1, 128) - input_ids = torch.randint(high=25000, size=input_shape) - attention_mask = torch.ones(input_shape) - - tt1.push_to_inputs(input_ids, attention_mask) - output_q = pybuda.run_inference(_verify_cfg=VerifyConfig(relative_atol=0.3), _sequential=True) - output = to_pt_tensors(output_q.get())[0] - - pt_output = model(input_ids, attention_mask)[0] - - relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device, "tiny", 1) - compare_tensor_to_golden("bert_out", pt_output, output, pcc=pcc, relative_atol=relative_atol) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py index 10ec33df..a1aa84f9 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py @@ -151,140 +151,9 @@ def forward(self, hidden_states): ) -def test_detr_50_backbone_layer(test_kind, test_device): - # As full model is running, no need to run dissected sub-modules - pytest.skip() - - if test_kind.is_training(): - pytest.skip() # TODO: debug data mismatch - - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - model = download_model(DetrModel.from_pretrained, "facebook/detr-resnet-50", torchscript=True) - - submodel = model.backbone.conv_encoder.model.layer1[0] - - mod = PyTorchModule( - "detr50_backbone_layer", - submodel, - ) - input_shape = (1, 64, 256, 256) - hidden_states = torch.rand(*input_shape) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_detr_50_encoder_layer(test_kind, test_device): - # As full model is running, no need to run dissected sub-modules - pytest.skip() - - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class DeTrEncoderWrapper(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.layer 
= module.encoder.layers[0] - self.attn_mask = torch.rand((1, 1, 256, 256)) - self.pos_emb = torch.rand((1, 256)) - - def forward(self, hidden_states): - return self.layer(hidden_states, self.attn_mask, self.pos_emb) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW op: heaviside - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - # Configure PyTorch module - pytorch_module = download_model( - DetrModel.from_pretrained, - "facebook/detr-resnet-50", torchscript=True - ) - pytorch_module = DeTrEncoderWrapper(pytorch_module) - module = PyTorchModule( - "detr50_encoder_layer", - pytorch_module, - ) - - input_shape = (1, 256, 256) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors=("layer.self_attn.k_proj.bias"), - ), - ) -def test_detr_50_decoder_layer(test_kind, test_device): - # As full model is running, no need to run dissected sub-modules - pytest.skip() - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - class DeTrDecoderWrapper(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.layer = module.decoder.layers[0] - self.attn_mask = torch.rand((1, 1, 256, 256)) - self.pos_emb = torch.rand((1, 256)) - self.kv_state = torch.rand((1, 1, 256, 256)) - def forward(self, hidden_states): - return self.layer( - hidden_states, self.attn_mask, self.pos_emb, self.kv_state - ) - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - # Unsupported HW op: heaviside - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - # Configure PyTorch module - pytorch_module = download_model( - DetrModel.from_pretrained, - "facebook/detr-resnet-50", torchscript=True - ) - pytorch_module = DeTrDecoderWrapper(pytorch_module) - module = PyTorchModule( - "detr50_decoder_layer", - pytorch_module, - ) - - input_shape = (1, 256, 256) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors=("layer.self_attn.k_proj.bias"), - ), - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py index b98ce44b..0ba8c1ed 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py @@ -281,47 +281,7 @@ def forward(self, hidden_states): outputs = output_q.get() -def test_t5_small_fallback(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - class T5Wrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_ids, decoder_input_ids): - return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - compiler_cfg.enable_tvm_constant_prop = True - - - pretrained_name = "t5-small" - config = T5Config.from_pretrained(pretrained_name) - - config.use_cache = False - model = T5Model(config) - pretrained_model = download_model(T5Model.from_pretrained, pretrained_name) - model.load_state_dict(pretrained_model.state_dict()) - mod = 
PyTorchModule("t5_small", T5Wrapper(model)) - - input_shape = (1, 128) - verify_module( - mod, - (input_shape, input_shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - input_params=[ - {"requires_grad": False, "data_format": torch.int}, - {"requires_grad": False, "data_format": torch.int}, - ], - ) class BlocksWrapper(torch.nn.Module): def __init__(self, model, num_blocks): @@ -914,54 +874,3 @@ def test_t5_past_cache_model(variant): print(f"generated tokens: {tokenizer.decode(generated_tokens)}") -def test_t5_small_tiny_tile(test_device): - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull test failing with TM ERROR (producer = matmul_49, consumer = matmul_53): input using kernel_broadcast but post-TM input canonical form is not periodic") - - import os - os.environ["PYBUDA_ENABLE_TINY_TILE"] = "1" - # Add PyBUDA configurations - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_tvm_cpu_fallback = False - compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 - compiler_cfg.amp_level = 1 - compiler_cfg.enable_enumerate_u_kt = False - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - compiler_cfg.compile_depth = CompileDepth.POST_PATTERN_MATCHER - - # Load tokenizer and model from HuggingFace - # Variants: t5-small, t5-base, t5-large - variant = "t5-small" - config = download_model(T5Config.from_pretrained, variant) - config_dict = config.to_dict() - config_dict['return_dict'] = False - config_dict['use_cache'] = False - config = T5Config(**config_dict) - model = download_model(T5ForConditionalGeneration.from_pretrained, variant, config=config) - - # Wrapper to get around attention mask - class Wrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, decoder_input_ids, encoder_outputs): - return self.model(None, None, decoder_input_ids, None, None, None, None, (encoder_outputs,)) - - tt_model = pybuda.PyTorchModule("t5_small_tiny_tile", Wrapper(model)) - - decoder_input_ids = torch.randint(0, model.config.vocab_size, (1, 1), dtype=torch.int32) - encoder_outputs = torch.randn(1, 1, 512) - - verify_module( - tt_model, - input_shapes=[(decoder_input_ids.shape, encoder_outputs.shape,)], - inputs=[(decoder_input_ids, encoder_outputs)], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - enabled=False - ) - ) \ No newline at end of file diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_xlm.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_xlm.py deleted file mode 100644 index a5863b79..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_xlm.py +++ /dev/null @@ -1,234 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -from pybuda.config import CompileDepth -import pytest - -import torch -import torch.nn as nn -from transformers.models.xlm import XLMConfig, XLMModel, XLMPreTrainedModel - -import math -import itertools -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, - tvm_to_python, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - - -class 
MultiHeadAttention(nn.Module): - - NEW_ID = itertools.count() - - def __init__(self, n_heads, dim, config): - super().__init__() - self.layer_id = next(MultiHeadAttention.NEW_ID) - self.dim = dim - self.n_heads = n_heads - self.dropout = config.attention_dropout - assert self.dim % self.n_heads == 0 - - self.q_lin = nn.Linear(dim, dim) - self.k_lin = nn.Linear(dim, dim) - self.v_lin = nn.Linear(dim, dim) - self.out_lin = nn.Linear(dim, dim) - self.pruned_heads = set() - - def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False): - """ - Self-attention (if kv is None) or attention over source sentence (provided by kv). - """ - # Input is (bs, qlen, dim) - # Mask is (bs, klen) (non-causal) or (bs, klen, klen) - bs, qlen, dim = input.size() - if kv is None: - klen = qlen if cache is None else cache["slen"] + qlen - else: - klen = kv.size(1) - # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' - n_heads = self.n_heads - dim_per_head = self.dim // n_heads - mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) - - def shape(x): - """projection""" - return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) - - def unshape(x): - """compute context""" - return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) - - q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) - if kv is None: - k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) - elif cache is None or self.layer_id not in cache: - k = v = kv - k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) - - if cache is not None: - if self.layer_id in cache: - if kv is None: - k_, v_ = cache[self.layer_id] - k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) - v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) - else: - k, v = cache[self.layer_id] - cache[self.layer_id] = (k, v) - - q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) - scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) - mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, 1e-10) # (bs, n_heads, qlen, klen) - - weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) - weights = nn.functional.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) - - # Mask heads if we want to - if head_mask is not None: - weights = weights * head_mask - - context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) - - outputs = (self.out_lin(context),) - if output_attentions: - outputs = outputs + (weights,) - return outputs - - -class XLMAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.causal = config.causal - - # dictionary / languages - self.n_langs = config.n_langs - self.use_lang_emb = config.use_lang_emb - self.n_words = config.n_words - self.eos_index = config.eos_index - self.pad_index = config.pad_index - - # model parameters - self.dim = config.emb_dim # 512 by default - self.hidden_dim = self.dim * 4 # 2048 by default - self.n_heads = config.n_heads # 8 by default - self.n_layers = config.n_layers - self.dropout = config.dropout - self.attention_dropout = config.attention_dropout - assert self.dim % self.n_heads == 0, 
"transformer dim must be a multiple of n_heads" - - self.attention = MultiHeadAttention(self.n_heads, self.dim, config=config) - self.ln = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) - - - def get_masks(self, slen, lengths, causal, padding_mask=None): - """ - Generate hidden states mask, and optionally an attention mask. - """ - alen = torch.arange(slen, dtype=torch.long) - if padding_mask is not None: - mask = padding_mask - else: - assert lengths.max().item() <= slen - mask = alen < lengths[:, None] - - # attention mask is the same as mask, or triangular inferior attention (causal) - bs = lengths.size(0) - if causal: - attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] - else: - attn_mask = mask - - # sanity check - assert mask.size() == (bs, slen) - assert causal is False or attn_mask.size() == (bs, slen, slen) - - return mask, attn_mask - - def forward(self, hidden_states): - bs, slen = hidden_states.shape[0], hidden_states.shape[1] - lengths = torch.tensor([slen] * bs) - - # check inputs - assert lengths.size(0) == bs - assert lengths.max().item() <= slen - - mask, attn_mask = self.get_masks(slen, lengths, self.causal) - attn_outputs = self.attention( - hidden_states, - attn_mask, - ) - return self.ln(attn_outputs[0]) - -input_shapes = [(1, 16, 2048)] - -def test_tvm_xlm_attention(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - config = XLMConfig() - - model = XLMAttention(config) - - mod = PyTorchModule("XLM_attention", model) - - input_shape = (1, 16, 2048) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"attention.k_lin.bias"}, - ), - uniform_inputs=True, - ) - -def test_tvm_xlm_FFN(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - recompute = True - - config = XLMConfig() - - model = XLMModel(config) - - mod = PyTorchModule("XLM_FFN", model.ffns[0]) - - input_shape = (1, 16, 2048) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_B/test_distilbert.py b/pybuda/test/tvm/nlp/pytorch/tests_B/test_distilbert.py deleted file mode 100644 index 387836e5..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_B/test_distilbert.py +++ /dev/null @@ -1,247 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import os -import torch -import pytest -from transformers import DistilBertModel - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - - -def test_distilbert_pt(test_kind, test_device): - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class Transformer(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.attn_mask = torch.ones((1, 128)) - self.module = 
module - - def forward(self, input_act): - return self.module(input_act, self.attn_mask) - - framework_module = download_model( - DistilBertModel.from_pretrained, - "distilbert-base-cased-distilled-squad" - ) - framework_module = Transformer(framework_module) - pybuda_module = PyTorchModule("distilbert_pt", framework_module) - - # Input shapes - input_act_shape = (1, 128) - - # Sanity check - # act = torch.randint(0, 25000, input_act_shape) - # out = framework_module(act) - - verify_module( - pybuda_module, - (input_act_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.95, - waive_gradient_errors={"attention.k_lin.bias"}, - ), - input_params=[{"data_format": torch.int}], - ) - - -def test_distilbert_layer_pt(test_kind, test_device): - pytest.skip("Covered in full DistilBert test") - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class Transformer(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - self.attn_mask = torch.ones((1, 128)) - - def forward(self, input_act): - return self.module.transformer.layer[0](input_act, self.attn_mask) - - framework_module = download_model( - DistilBertModel.from_pretrained, - "distilbert-base-cased-distilled-squad" - ) - framework_module = Transformer(framework_module) - pybuda_module = PyTorchModule("distilbert_layer_pt", framework_module) - - # Input shapes - input_act_shape = (1, 128, 768) - - # Sanity check - # act = torch.rand(input_act_shape) - # out = framework_module(act) - - verify_module( - pybuda_module, - (input_act_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.95, - verify_all=True, - ), - ) - - -def test_distilbert_layer_mha_pt(test_kind, test_device): - pytest.skip("Covered in full DistilBert test") - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - class Transformer(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - self.attn_mask = torch.ones((1, 128)) - - def forward(self, q_act, k_act, v_act): - return self.module.transformer.layer[0].attention( - q_act, k_act, v_act, self.attn_mask - ) - - framework_module = download_model( - DistilBertModel.from_pretrained, - "distilbert-base-cased-distilled-squad" - ) - framework_module = Transformer(framework_module) - pybuda_module = PyTorchModule("distilbert_layer_mha_pt", framework_module) - - # Input shapes - inp_shape = (1, 128, 768) - - # Sanity check - q_act = torch.rand(inp_shape) - k_act = torch.rand(inp_shape) - v_act = torch.rand(inp_shape) - out = framework_module(q_act, k_act, v_act) - - verify_module( - pybuda_module, - (inp_shape, inp_shape, inp_shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.95, - verify_all=True, - waive_gradient_errors={"attention.k_lin.bias"} - ), - ) - - -def test_distilbert_layer_with_embeddings_pt(test_kind, test_device): - pytest.skip("Covered in full DistilBert test") - # Only run recompute test in post-commit - if test_kind == TestKind.TRAINING: - pytest.skip() - - # Test only inference - if test_kind.is_training(): - pytest.skip() - - os.environ["PYBUDA_RELOAD_GENERATED_MODULES"] = "1" - - class Transformer(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.attn_mask = torch.ones((1, 128)) - self.module = module - - def 
forward(self, input_act): - # return self.module(input_act, self.attn_mask) - emb_out = self.module.embeddings(input_act) - return self.module.transformer.layer[0](emb_out, self.attn_mask) - - framework_module = download_model( - DistilBertModel.from_pretrained, - "distilbert-base-cased-distilled-squad" - ) - framework_module = Transformer(framework_module) - pybuda_module = PyTorchModule( - "distilbert_layer_with_embeddings_pt", framework_module - ) - - # Input shapes - input_act_shape = (1, 128) - - # Sanity check - # act = torch.randint(0, 25000, input_act_shape) - # out = framework_module(act) - - verify_module( - pybuda_module, - (input_act_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.95, - # waive_gradient_errors={ - # "attention/k_lin/bias:0", - # "LayerNorm.weight", - # "LayerNorm.bias", - # }, - verify_all=True, - ), - input_params=[{"data_format": torch.int}], - ) - - -def test_distilbert_without_embeddings_pt(test_kind, test_device): - pytest.skip("Covered in full DistilBert test") - # Test only inference - if test_kind.is_training(): - pytest.skip() - - class Transformer(torch.nn.Module): - def __init__(self, module): - super().__init__() - self.attn_mask = torch.ones((1, 32)) - self.module = module - - def forward(self, inputs_embeds): - return self.module(None, self.attn_mask, None, inputs_embeds) - - framework_module = download_model( - DistilBertModel.from_pretrained, - "distilbert-base-cased-distilled-squad", torchscript=True - ) - framework_module = Transformer(framework_module) - pybuda_module = PyTorchModule("distilbert_without_embeddings_pt", framework_module) - - # Input shapes - input_emb_shape = (1, 32, 768) - - # Sanity check - # inputs_embeds = torch.rand(input_emb_shape) - # out = framework_module(inputs_embeds) - - verify_module( - pybuda_module, - (input_emb_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_B/test_wmt.py b/pybuda/test/tvm/nlp/pytorch/tests_B/test_wmt.py deleted file mode 100644 index cf2546e3..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_B/test_wmt.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch -import torch.nn as nn -from transformers import FSMTModel - -from pybuda import ( - PyTorchModule, - VerifyConfig, -) -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import CompileDepth, _get_global_compiler_config - - -class WMT_Encoder_Wrapper(nn.Module): - def __init__(self): - super().__init__() - # model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', tokenizer='moses', bpe='fastbpe') - model = FSMTModel.from_pretrained("facebook/wmt19-en-de", torchscript=True) - - self.mod = model.encoder.layers[0] - self.encoder_padding_mask = torch.zeros((1, 1)).to(torch.bool) - self.attn_mask = torch.ones((16,)).to(torch.bool) - - def forward(self, x): - return self.mod(x, self.encoder_padding_mask, self.attn_mask)[0] - - -def test_wmt_encoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - submodel = WMT_Encoder_Wrapper() - mod = PyTorchModule("wmt16_encoder", submodel) - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - else: - 
compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - # out = model(torch.randint(0, 256, (1, 1), dtype=torch.int32)) - out = submodel(torch.rand((1, 1, 1024))) - - verify_module( - mod, - ((1, 1, 1024),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -class WMT_Decoder_Wrapper(nn.Module): - def __init__(self): - super().__init__() - # model = torch.hub.load('pytorch/fairseq', 'transformer.wmt16.en-de', tokenizer='moses', bpe='subword_nmt') - model = FSMTModel.from_pretrained("facebook/wmt19-en-de", torchscript=True) - - self.mod = model.decoder.layers[0] - - def forward(self, x): - encoder_out = torch.ones(x.shape) - encoder_padding_mask = None - incremental_state = None - prev_self_attn_state = None - prev_attn_state = None - attn_mask = None - attn_padding_mask = None - result = self.mod( - x, - encoder_out, - encoder_padding_mask, - incremental_state, - prev_self_attn_state, - prev_attn_state, - attn_mask, - attn_padding_mask, - True, - ) - - return result[0], result[1] - - -def test_wmt_decoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - submodel = WMT_Decoder_Wrapper() - - mod = PyTorchModule("wmt16_decoder", submodel) - - if test_kind.is_training(): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - else: - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - - verify_module( - mod, - ((1, 32, 1024),), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_opt.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_opt.py deleted file mode 100644 index 3a5172f8..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_opt.py +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest - -import torch -from transformers import OPTModel, OPTConfig -# from transformers.models.opt.modeling_opt import XGLMAttention, ACT2FN -from pybuda import ( - PyTorchModule, - BackendType, - VerifyConfig, -) - - -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_opt_decoder(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - if test_kind.is_training() and test_device.devtype == BackendType.Silicon: - pytest.skip() - - configuration = OPTConfig() - model = OPTModel(configuration) - - submodel = model.decoder.layers[0] - mod = PyTorchModule("OPT_decoder_layer", submodel) - - relative_atol = 0.4 if test_device.devtype == BackendType.Silicon else 0.1 - pcc = 0.9 if test_device.devtype == BackendType.Silicon else 0.99 - input_shape = (1, 32, 768) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"self_attn.k_proj.bias"}, - relative_atol=relative_atol, - pcc=pcc, - ) - ) - - -def test_opt_full(test_kind, test_device): - if test_kind == TestKind.TRAINING: - pytest.skip() - - configuration = OPTConfig() - configuration.return_dict = False - model = OPTModel(configuration) - - submodel = model - mod = PyTorchModule("OPT_full", submodel) - - relative_atol = 0.4 if test_device.devtype == BackendType.Silicon else 0.1 - pcc = 0.9 if test_device.devtype == BackendType.Silicon else 0.99 - input_shape = (1, 128) 
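- # Note (editorial comment, not in the original test): token ids are built
- # explicitly with randint over configuration.vocab_size and handed to
- # verify_module via inputs=[...], so the embedding input stays integer-typed
- # instead of verify_module generating random float activations for it.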
- inputs = [torch.randint(0, configuration.vocab_size, input_shape)] - verify_module( - mod, - (input_shape,), - inputs=[inputs], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"self_attn.k_proj.bias"}, - relative_atol=relative_atol, - pcc=pcc, - ) - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_roberta.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_roberta.py deleted file mode 100644 index 10d3faca..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_roberta.py +++ /dev/null @@ -1,156 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pybuda -from pybuda.config import CompileDepth -from pybuda.cpudevice import CPUDevice -from pybuda.verify.cpueval import TrainingEvalData -import pytest -from loguru import logger - - -import torch -from transformers import RobertaModel, RobertaConfig - -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.op.eval import compare_tensor_to_golden -from test.tvm.utils import evaluate_framework_vs_pybuda -from test.backend.models.test_bert import get_relaxed_atol_pcc -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - -class EmbWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model.embeddings - - def forward( - self, - input_ids, - extended_attention_mask, - ) -> torch.Tensor: - return self.model(input_ids), extended_attention_mask - -class RobertaEncoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.encoder = model.encoder - self.pooler = model.pooler - - def forward(self, embedding_output, extended_attention_mask): - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - ) - return encoder_outputs - -def test_roberta_pipeline(test_kind, test_device): - pytest.skip("Full model passes inference and training") - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - pass#compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - pass# compiler_cfg.compile_depth = CompileDepth.POST_INITIAL_GRAPH_PASS - - config = RobertaConfig() - model = RobertaModel(config, add_pooling_layer=False) - model.eval() - - roberta_embeddings = EmbWrapper(model) - roberta_encoder = RobertaEncoder(model) - - - cpu0 = CPUDevice("cpu0", module=PyTorchModule("roberta_embeddings", roberta_embeddings)) - tt1 = TTDevice("tt1", devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("roberta_encoder_stack", roberta_encoder)) - - seq_len = 128 - input_ids = torch.randint(config.vocab_size, (1, seq_len)) - attention_mask = torch.ones((1, seq_len)) - extended_attention_mask = model.get_extended_attention_mask(attention_mask, input_ids.size()) - cpu0.push_to_inputs(input_ids, extended_attention_mask) - # tt1.push_to_inputs(input_ids) - output_q = pybuda.run_inference() - outputs = output_q.get() - - torch_outputs = model(input_ids) - assert compare_tensor_to_golden("roberta", torch_outputs[0], outputs[0].value(), is_buda=True) - - -def test_roberta_encoder(test_kind, test_device): - pytest.skip("Full model passes training and inference") - if 
test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - input_shape = (1, 256, 256) - roberta_model = download_model(RobertaModel.from_pretrained, "arampacha/roberta-tiny", torchscript=True) - model = roberta_model.encoder - - hidden_states = torch.rand(*input_shape) - - mod = PyTorchModule("roberta_encoder_pytorch", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"layer.0.attention.self.key.bias", "layer.1.attention.self.key.bias", - "layer.2.attention.self.key.bias", "layer.3.attention.self.key.bias"} # small numbers - ) - ) - -def test_roberta_full(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - input_shape = (1, 128) - model = download_model(RobertaModel.from_pretrained, "arampacha/roberta-tiny", torchscript=True) - model.pooler = None - model.return_dict = False - - class RobertaWrapper(torch.nn.Module): - def __init__(self, roberta): - super().__init__() - self.roberta = roberta - - def forward(self, x): - out = self.roberta(x) - - return [output for output in out if output is not None] - - input_ids = [torch.randint(0, input_shape[-1], input_shape)] - - mod = PyTorchModule("roberta_encoder_pytorch", RobertaWrapper(model)) - - verify_module( - mod, - (input_shape,), - inputs=[input_ids], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=0.97, - waive_gradient_errors={"layer.0.attention.self.key.bias", "layer.1.attention.self.key.bias", - "layer.2.attention.self.key.bias", "layer.3.attention.self.key.bias"} # small numbers - ) - ) - diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py deleted file mode 100644 index 8e010276..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# TrOCR basic bring-up tests of tracing functionality -# -import torch -import pytest -from transformers import ( - TrOCRConfig, - TrOCRForCausalLM, - ViTConfig, - ViTModel, - VisionEncoderDecoderModel, -) - -from pybuda import PyTorchModule, VerifyConfig -from pybuda.verify import verify_module -from pybuda.config import _get_global_compiler_config - - -def test_trocr_reduced_size(test_kind, test_device): - # import os - # os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" - # os.environ["PYBUDA_RELOAD_GENERATED_MODULES"] = "1" - - # In transformers version update from 4.35.2 to 4.41.0 - # torch.full operation with bool type fill value is introduced - # in MaxLengthCriteria class (used by .generate function) - # which results in Jit trace failure - # Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2728 - - pytest.skip("Skipped due to jit trace issue in FULL op") - - class Module(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, pixel_values): - return self.model.generate(pixel_values) - - # Compile configuration - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_cpu_fallback = False - compiler_cfg.enable_tm_cpu_fallback = False - compiler_cfg.balancer_policy = "Ribbon" - - # Input shape - input_shape = (1, 3, 128, 
128) - - # Load model - encoder_config = ViTConfig() - encoder_config.num_attention_heads = 1 - encoder_config.num_hidden_layers = 1 - encoder_config.image_size = input_shape[-1] - encoder = ViTModel(encoder_config) - - decoder_config = TrOCRConfig() - decoder_config.decoder_attention_heads = 1 - decoder_config.decoder_layers = 1 - decoder = TrOCRForCausalLM(decoder_config) - framework_model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - framework_model = Module(framework_model) - - # Larger variation - also works - # # Input shape - # input_shape = (1, 3, 128, 128) - - # # Load model - # encoder_config = ViTConfig() - # encoder_config.num_attention_heads = 4 - # encoder_config.num_hidden_layers = 4 - # encoder_config.image_size = input_shape[-1] - # encoder = ViTModel(encoder_config) - - # decoder_config = TrOCRConfig() - # decoder_config.decoder_attention_heads = 4 - # decoder_config.decoder_layers = 4 - # decoder = TrOCRForCausalLM(decoder_config) - # framework_model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - # framework_model = Module(framework_model) - - # Sanity check - # pixel_values = torch.rand(input_shape) - # generated_ids = framework_model(pixel_values) - - verify_module( - PyTorchModule("pt_trocr", framework_model), - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - # verify_all=True, - ), - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_unispeech.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_unispeech.py index 4f639fd5..5760598e 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_unispeech.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_C/test_unispeech.py @@ -26,70 +26,6 @@ from pybuda.verify.config import TestKind from test.utils import download_model -def test_unispeech(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - pytest.skip() # See tenstorrent/pybuda#1935 - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"encoder.pos_conv_embed.conv.weight_v"} - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - framework_model = download_model( - UniSpeechModel.from_pretrained, - "microsoft/unispeech-sat-base", torchscript=True - ) - - mod = PyTorchModule( - "unispeech_full_model", - framework_model, - ) - - input_shape = (1, 512) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - -def test_unispeech_conv_feature_encoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER # Unsupported HW ops - - framework_model = download_model( - UniSpeechModel.from_pretrained, - "microsoft/unispeech-sat-base", torchscript=True - ) - - framework_submodel = framework_model.feature_extractor.conv_layers[0] - mod = PyTorchModule( - "unispeech_conv_feature_encoder", - framework_submodel, - ) - - input_shape = (1, 1, 512) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - def test_unispeech_feature_projection(test_kind, test_device): if test_kind == TestKind.TRAINING: # only run recompute 
test in post-commit pytest.skip() diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_wav2vec2.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_wav2vec2.py index eb5614ed..35eac559 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_wav2vec2.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_C/test_wav2vec2.py @@ -27,70 +27,6 @@ from test.utils import download_model -def test_wav2vec2(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - pytest.skip() # See tenstorrent/pybuda#1935 - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"encoder.pos_conv_embed.conv.weight_v"} - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - framework_model = download_model( - Wav2Vec2Model.from_pretrained, - "facebook/wav2vec2-base", torchscript=True - ) - - mod = PyTorchModule( - "wav2vec2_full_model", - framework_model, - ) - - input_shape = (1, 512) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - verify_all=True, - ) - ) - - - -def test_wav2vec2_base_conv_feature_encoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - framework_model = download_model( - Wav2Vec2Model.from_pretrained, - "facebook/wav2vec2-base", torchscript=True - ) - - framework_submodel = framework_model.feature_extractor.conv_layers[0] - mod = PyTorchModule( - "wav2vec2_conv_feature_encoder", - framework_submodel, - ) - - input_shape = (1, 1, 512) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - # TODO: Increase batch dim when possible def test_wav2vec2_base_transformer_encoder(test_kind, test_device): diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py index 3a4566fb..21a27a59 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py @@ -59,35 +59,6 @@ def test_bloom_model_transposed(test_kind, test_device): ) ) -def test_bloom_model(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - model = Transformer() - compiler_cfg = _get_global_compiler_config() - - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - else: - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS # Unsupported HW ops - - submodel = model - mod = PyTorchModule("bloom_encoder", submodel) - - input_shape = (1, 32, 128) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - - def test_bloom_hf(test_kind, test_device): if test_kind.is_training(): # output mismatch diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py deleted file mode 100644 index b3298fc7..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py +++ /dev/null @@ -1,630 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing 
functionality -# -import pytest - -import torch -from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer -from transformers.pytorch_utils import Conv1D -from test.backend.models.test_bert import get_relaxed_atol_pcc - -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - VerifyConfig, - run_generate, -) -import pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - -from typing import Optional, Tuple, Union -import math -from loguru import logger - -import os - -input_shapes = [(1, 64, 768)] - - -def test_pt_gpt2_tokengen(): - torch.set_printoptions(linewidth=200) - model = GPT2LMHeadModel.from_pretrained("gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - tokenizer.pad_token = tokenizer.eos_token - - pad_token = tokenizer(tokenizer.pad_token)["input_ids"][0] - - past_cache_length = 30 - run_length = 6 - - total_length = past_cache_length + run_length - inputs = tokenizer("The", max_length=past_cache_length, pad_to_max_length=True, truncation=True, return_tensors="pt") - input_ids = torch.tensor(inputs["input_ids"]) - input_ids[:][:] = pad_token - past_key_values = model(input_ids, return_dict=False)[1] - - next_key_values = past_key_values - past_key_values = list(next_key_values) - for block_idx, block in enumerate(next_key_values): - past_key_values[block_idx] = list(block) - for kv_idx, kv in enumerate(block): - past_key_values[block_idx][kv_idx] = torch.zeros_like(kv) - - prefix_text = "My name is Bert," - inputs = tokenizer(prefix_text, max_length=run_length, pad_to_max_length=True, truncation=True, return_tensors="pt") - input_ids = inputs["input_ids"] - attention_mask = inputs["attention_mask"] - - last_prefix_token = int((attention_mask[0] == 0).nonzero()[0][0]) - 1 - tokens_to_generate = 30 - - full_attention_mask = torch.zeros(total_length).int().unsqueeze(0) - full_attention_mask[:,past_cache_length:] = torch.tensor(inputs["attention_mask"]).int() - - all_generated_tokens = [] - - past_length = 0 - print("") - for i in range(last_prefix_token, last_prefix_token + tokens_to_generate): - position_ids = torch.arange(past_length, past_length + run_length) - lm_head_out, next_key_values = model(input_ids, attention_mask=full_attention_mask, past_key_values=past_key_values, position_ids=position_ids, return_dict=False) - - next_token = torch.argmax(lm_head_out, dim=-1)[0][i % run_length] - all_generated_tokens.append(next_token) - - next_token_index = (i + 1) % run_length - if next_token_index == 0: - tile_index = ((i + 1) // run_length) - 1 - past_length += run_length - input_ids[:][:] = pad_token - pce = i + 1 - pcb = 0 - full_attention_mask[:, pcb:pce] = 1 - full_attention_mask[:, -run_length:] = 0 - print(full_attention_mask) - - past_key_values = list(next_key_values) - for block_idx, block in enumerate(next_key_values): - past_key_values[block_idx] = list(block) - for kv_idx, kv in enumerate(block): - past_key_values[block_idx][kv_idx][:, :, tile_index*run_length:(tile_index+1)*run_length, :] = kv[:, :, -run_length:, :] - past_key_values[block_idx][kv_idx] = past_key_values[block_idx][kv_idx].narrow(2, 0, past_cache_length) - print(abs(past_key_values[0][0][:, 0, :, 0].int())) - input_ids[0][next_token_index] = next_token - full_attention_mask[0][past_cache_length + next_token_index] = 1 - - generated_text_pt = tokenizer.decode(all_generated_tokens) - logger.info(f"PT Generated text: {generated_text_pt}") - -def 
test_tvm_gpt2_attention_with_past_cache(test_device): - hidden_size = 768 - num_heads = 12 - max_seq_len = 512 - - class AttentionWrapper(torch.nn.Module): - def __init__(self, attn): - super().__init__() - self.attn = attn - - def forward(self, hidden_states, attention_mask, key_past, value_past): - key_past = self.attn._split_heads(key_past, 12, 64) - value_past = self.attn._split_heads(value_past, 12, 64) - layer_past = (key_past, value_past) - hidden_states, (key_past, value_past) = self.attn(hidden_states, layer_past=layer_past, use_cache=True, attention_mask=attention_mask) - key_present = key_past[:, :, -32:, :] - key_present = self.attn._merge_heads(key_present, 12, 64) - value_present = value_past[:, :, -32:, :] - value_present = self.attn._merge_heads(value_present, 12, 64) - return hidden_states, key_present, value_present - - torch.manual_seed(52) - torch_mod = AttentionWrapper(GPT2Model.from_pretrained("gpt2").h[0].attn) - layer_past_shape = (1, 480, 768) - mod = PyTorchModule("gpt2_cached_attn", torch_mod) - - hidden_states_shape = (1, 32, hidden_size) - attention_mask_shape = (1, 1, 1, max_seq_len) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.loopback_outputs = {"tensor_1": 1, "tensor_5": 2} - - tt0 = TTDevice("tt0", devtype=test_device.devtype) - tt0.place_module(mod) - output_q = pybuda.initialize_pipeline(training=False, sample_inputs=(torch.rand(hidden_states_shape), torch.rand(attention_mask_shape), torch.zeros(layer_past_shape), torch.zeros(layer_past_shape)), _verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - verify_all=True, - )) - print(pybuda.get_parameter_checkpoint()[0]['tensor_1']) - tt0.push_to_inputs((torch.rand(hidden_states_shape), torch.rand(attention_mask_shape), )) - pybuda.run_generate(input_count=1, write_index=0) - pk = pybuda.get_parameter_checkpoint()[0]['tensor_1'].value() - ans = output_q.get(timeout = 0.5) - print(pybuda.get_parameter_checkpoint()[0]['tensor_1']) - tt0.push_to_inputs((torch.rand(hidden_states_shape), torch.rand(attention_mask_shape), )) - pybuda.run_generate(input_count=1, write_index=1) - print(pybuda.get_parameter_checkpoint()[0]['tensor_1']) - tt0.push_to_inputs((torch.rand(hidden_states_shape), torch.rand(attention_mask_shape), )) - pybuda.run_generate(input_count=1, write_index=2) - print(pybuda.get_parameter_checkpoint()[0]['tensor_1']) - -class EmbWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.gpt2 = model.transformer - - def forward(self, input_ids, attention_mask, position_ids, *kv): - inputs_embeds = self.gpt2.wte(input_ids) - position_embeds = self.gpt2.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - attention_mask = attention_mask.unsqueeze(0).unsqueeze(0) - extended_attention_mask = (1.0 - attention_mask) * -10000.0 - return hidden_states, extended_attention_mask, *kv - -class BlocksWrapper(torch.nn.Module): - def __init__(self, model, num_blocks): - super().__init__() - self.gpt2 = model.transformer - self.num_blocks = num_blocks - - def forward(self, hidden_states, extended_attention_mask, *kv): - presents = [] - for i, block in enumerate(self.gpt2.h[:self.num_blocks]): - past_key = kv[i * 2] - past_value = kv[(i * 2) + 1] - past_key = self.gpt2.h[0].attn._split_heads(past_key, 12, 64) - past_value = self.gpt2.h[0].attn._split_heads(past_value, 12, 64) - layer_past = (past_key, past_value) - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=extended_attention_mask, - use_cache=True, 
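- # Editorial comment: with use_cache=True each block also returns its
- # (key, value) tensors; the last 32 rows are merged back to (1, 32, 768)
- # below and looped back into the on-device cache buffers.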
- ) - hidden_states = outputs[0] - key_present = outputs[1][0][:, :, -32:, :] - key_present = self.gpt2.h[0].attn._merge_heads(key_present, 12, 64) - presents.append(key_present) - value_present = outputs[1][1][:, :, -32:, :] - value_present = self.gpt2.h[0].attn._merge_heads(value_present, 12, 64) - presents.append(value_present) - hidden_states = self.gpt2.ln_f(hidden_states) - return hidden_states, *presents - -class LMHeadWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.lm_head = model.lm_head - - def forward(self, hidden_states, *kv): - return self.lm_head(hidden_states) - -def test_gpt2_past_cache(test_device): - if not test_device.is_silicon(): - pytest.skip() # too long for post-commit - - compiler_cfg = _get_global_compiler_config() - num_blocks = 12 - if num_blocks == 12: - compiler_cfg.loopback_outputs = {"tensor_1": 1, "tensor_5": 2, - "tensor_33": 3, "tensor_37": 4, - "tensor_65": 5, "tensor_69": 6, - "tensor_97": 7, "tensor_101": 8, - "tensor_129": 9, "tensor_133": 10, - "tensor_161": 11, "tensor_165": 12, - "tensor_193": 13, "tensor_197": 14, - "tensor_225": 15, "tensor_229": 16, - "tensor_257": 17, "tensor_261": 18, - "tensor_289": 19, "tensor_293": 20, - "tensor_321": 21, "tensor_325": 22, - "tensor_353": 23, "tensor_357": 24} - else: - compiler_cfg.loopback_outputs = {"past_key_1": 1, "past_value_1": 2, "past_key": 3, "past_value": 4} - - model = GPT2LMHeadModel.from_pretrained("gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - tokenizer.pad_token = tokenizer.eos_token - - embeddings = EmbWrapper(model) - blocks = BlocksWrapper(model, num_blocks=num_blocks) - lm_head = LMHeadWrapper(model) - - - past_length = 0 - run_length = 32 - pad_token = tokenizer(tokenizer.pad_token)["input_ids"][0] - prefix_text = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains." 
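- # Editorial comment on the cache layout used below: each layer keeps a
- # (1, 480, 768) key/value buffer on device (layer_past_shape), decoding runs
- # in 32-token windows (run_length), and write_index picks which 32-row slice
- # of the cache the fresh K/V rows are written back into. The attention mask
- # therefore spans 480 cached positions plus the 32-token live window.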
- inputs = tokenizer(prefix_text, max_length=run_length, pad_to_max_length=True, truncation=True) - prefix_text = "" - input_ids_tt = torch.tensor(inputs["input_ids"]).int().unsqueeze(0) - attention_mask = torch.tensor(inputs["attention_mask"]).int().unsqueeze(0) - attention_mask = torch.cat((torch.zeros(1, 480), attention_mask), -1) - position_ids = torch.arange(past_length, past_length + run_length) - - last_prefix_token = inputs["attention_mask"].index(0) - 1 - tokens_to_generate = 480 - - cpu0 = pybuda.CPUDevice("cpu0", module=PyTorchModule("gpt2_embeddings", embeddings)) - tt1 = pybuda.TTDevice("tt1", - devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("gpt2_blocks", blocks)) - cpu1 = pybuda.CPUDevice("cpu1", module=PyTorchModule("gpt2_lm_head", lm_head)) - - layer_past_shape = (1, 480, 768) - inputs = (input_ids_tt, attention_mask, position_ids) - for _ in range(num_blocks): - inputs += (torch.zeros(layer_past_shape), torch.zeros(layer_past_shape)) - - output_q = pybuda.initialize_pipeline(training=False, sample_inputs=inputs,) - write_index = 0 - current_token_index = last_prefix_token - for i in range(tokens_to_generate): - position_ids = torch.arange(past_length, past_length + run_length) - cpu0.push_to_inputs((input_ids_tt, attention_mask, position_ids)) - pybuda.run_generate(input_count=1, write_index=write_index) - outputs = output_q.get() - lm_head_out = outputs[0].value().detach() - k = 10 - top_k_probs, top_k_ids = torch.topk(lm_head_out[0,current_token_index], k=k) - next_token = top_k_ids[torch.randint(k-1, (1, ))] - # next_token = torch.argmax(lm_head_out, dim=-1)[0][current_token_index] - current_token_index += 1 - if current_token_index == 32: - past_length += run_length - current_token_index = 0 - attention_mask[0][write_index * 32 : (write_index + 1) * 32] = 1 - attention_mask[0][-32:] = 0 - write_index += 1 - prefix_text += tokenizer.decode(input_ids_tt[0][:].numpy().tolist()) - input_ids_tt[:][:] = pad_token - input_ids_tt[0][current_token_index] = next_token - scrubbed_input = input_ids_tt[input_ids_tt != pad_token] - print(f"Generated text: {tokenizer.decode(scrubbed_input.numpy().tolist())}") - attention_mask[0][480 + current_token_index] = 1 - - print(f"Generated text: {prefix_text}") - - # prefix_text = "My name is Ljubisa, and I am" - # inputs = tokenizer(prefix_text, max_length=64, pad_to_max_length=True, truncation=True) - # input_ids_pt = torch.tensor(inputs["input_ids"]).int().unsqueeze(0) - # attention_mask = torch.tensor(inputs["attention_mask"]).int().unsqueeze(0) - - # for i in range(tokens_to_generate): - # embedding_output = embeddings(input_ids_pt, attention_mask) - # model_output = blocks(*embedding_output) - # lm_head_out = lm_head(model_output) - # next_token = torch.argmax(lm_head_out, dim=-1)[0][last_prefix_token + i] - # next_token_index = last_prefix_token + i + 1 - # input_ids_pt[0][next_token_index] = next_token - # attention_mask[0][next_token_index] = 1 - -def test_tvm_past_cache_generate(test_device): - class PastCache(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, y, key_past): - key_past = key_past + y - key_past_sliced = key_past[:, :, -32:, :] - return key_past_sliced, key_past_sliced + 1 - - torch_mod = PastCache() - mod = PyTorchModule("cached_attn", torch_mod) - tt0 = TTDevice("tt0", devtype=test_device.devtype) - tt0.place_module(mod) - - - single_cache_line = (1, 1, 32, 32) - layer_past_shape = (1, 1, 192, 32) - - compiler_cfg = _get_global_compiler_config() - 
compiler_cfg.loopback_outputs = {"key_past_1": 0} - - output_q = pybuda.initialize_pipeline(training=False, sample_inputs=(torch.rand(1), torch.zeros(layer_past_shape)), _verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - )) - print(pybuda.get_parameter_checkpoint()) - tt0.push_to_inputs((torch.rand(1), )) - tt0.push_to_inputs((torch.rand(1), )) - pybuda.run_generate(input_count=2, tokens_per_iter=32, token_id=0) - print(pybuda.get_parameter_checkpoint()) - ans = output_q.get() - tt0.push_to_inputs((torch.rand(1), )) - pybuda.run_generate(input_count=1, tokens_per_iter=31, token_id=64) - ans = output_q.get() - print(pybuda.get_parameter_checkpoint()) - tt0.push_to_inputs((torch.rand(1), )) - tt0.push_to_inputs((torch.rand(1), )) - pybuda.run_generate(input_count=2, tokens_per_iter=1, token_id=95) - ans = output_q.get() - print(pybuda.get_parameter_checkpoint()) - - -def test_past_cache_prefill_generate(test_device): - class PastCache_attn(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm = torch.nn.Parameter(torch.rand(32, 32)) - - def forward(self, input_prefill): - out = torch.matmul(input_prefill, self.mm) - return out - - - class PastCache_prefill(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, input_gen, prefill_output): - - key_past = input_gen + prefill_output - key_past_sliced = key_past[:, :, -32:, :] - return key_past_sliced - torch_mod_0 = PastCache_attn() - torch_mod_1 = PastCache_prefill() - - mod_0 = PyTorchModule("cached_attn", torch_mod_0) - mod_1 = PyTorchModule("cached_prefill", torch_mod_1) - - tt0 = TTDevice("tt0", devtype=test_device.devtype) - tt0.place_module(mod_0) - tt0.place_module(mod_1) - - - input_prefil_shape = (1, 1, 480, 32) - input_generate_shape = (1,) - past_shape = (1, 1, 480, 32) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_subgraphs = True - compiler_cfg.balancer_op_override("matmul_3_output_nop_0", "t_stream_shape", (15,1)) - # compiler_cfg.loopback_outputs = {"prefill_output": (0, 1)} - - output_q = pybuda.initialize_pipeline( - training=False, - sample_inputs=((torch.rand(input_prefil_shape),), (torch.rand(input_generate_shape), torch.rand(past_shape),),), - _verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - ).disabled() - ) - tt0.set_active_subgraph(0) - tt0.push_to_inputs((torch.rand(input_prefil_shape), )) - pybuda.run_forward() - - tt0.set_active_subgraph(1) - tt0.push_to_inputs((torch.rand(input_generate_shape), torch.rand(past_shape),)) - pybuda.run_forward() - - -@pytest.mark.skip(reason="Tested with fallback") -def test_tvm_gpt2_block(test_kind, test_device): - # Training without TVM constant prop will result in the following error in placer - # RuntimeError: trying to place bw_in0_gpt2_block.attn_c_attn_weight_combine_add_0_transpose_nop - # and exceeded max-placement-attempts: grid_shape: (2, 8, original context.start=(.row=5, .col = 5) - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - if test_kind.is_training(): - test_device.devtype = BackendType.NoBackend - - model = GPT2Model.from_pretrained("gpt2") - mod = PyTorchModule("gpt2_block", model.h[0]) - input_shape = (1, 64, 768) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"attn.c_attn.weight", "attn.c_attn.bias"} - - relative_atol = 0.4 if test_device.devtype == BackendType.Silicon else 0.1 - pcc = 0.9 if 
test_device.devtype == BackendType.Silicon else 0.99 - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"c_attn.bias_1"}, - relative_atol=relative_atol, - pcc=pcc, - ) - ) - -@pytest.mark.skip(reason="Tested with fallback") -def test_tvm_gpt2_blocks(test_device): - class ListWrapper(torch.nn.Module): - def __init__(self, module_list): - super().__init__() - self.module_list = module_list - - def forward(self, hidden_states): - for module in self.module_list: - hidden_states = module(hidden_states)[0] - - return hidden_states - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"attn.c_attn.weight", "attn.c_attn.bias"} - - input_shape = (1, 64, 768) - model = GPT2Model.from_pretrained("gpt2") - - torch_mod = ListWrapper(model.h) - mod = PyTorchModule("gpt2", torch_mod) - - torch.manual_seed(42) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ), - uniform_inputs=True, - ) - -def test_new_gelu(test_device): - class NewGELUActivation(torch.nn.Module): - """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - def forward(self, input): - return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) - - torch_mod = NewGELUActivation() - mod = PyTorchModule("new_gelu", torch_mod) - - input_shape = (1, 64, 3072) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ) - ) - -def test_gelu(test_device): - class GELUActivation(torch.nn.Module): - """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). 
Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - def forward(self, input): - return torch.nn.functional.gelu(input) - - torch_mod = GELUActivation() - mod = PyTorchModule("gelu", torch_mod) - - input_shape = (1, 64, 3072) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - ) - ) - - -from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer -def test_tvm_gpt2_fallback(test_kind, test_device): - - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"attn.c_attn.weight", "attn.c_attn.bias"} - - input_shape = (1, 768) - - config = GPT2Config.from_pretrained("gpt2") - config.num_hidden_layers = 2 - # config.use_cache = False - config.return_dict = False - - model = GPT2LMHeadModel(config) - - mod = PyTorchModule("gpt2", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"c_attn.bias"}, - ), - input_params=[{"requires_grad": False, "data_format": torch.int}], - ) - -from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer -def test_tvm_gpt2_lmhead(test_kind, test_device): - class LMHeadWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, hidden_states): - hidden_states = model.transformer.h[0](hidden_states)[0] - lm_logits = self.model.lm_head(hidden_states) - - return lm_logits - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.tvm_constnat_prop_mask={"attn.c_attn.weight", "attn.c_attn.bias"} - - - input_shape = input_shape = (1, 64, 768) - - config = GPT2Config.from_pretrained("gpt2") - config.num_hidden_layers = 1 - config.use_cache = False - config.return_dict = False - - - model = GPT2LMHeadModel(config) - - mod = PyTorchModule("gpt2", LMHeadWrapper(model)) - - relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"c_attn.bias"}, - relative_atol=relative_atol, - pcc=pcc, - ), - uniform_inputs=True, - ) - -class SpliceUnit(torch.nn.Module): - def __init__(self): - super().__init__() - def forward(self, cache, line): - # cache: 1 x 1 x 32 x 32 - # line: 1 x 1 x 32 x 1 - x = cache[...,1:] - out = torch.cat((x, line), dim=-1) - return out - -def test_splice(test_device): - import pybuda - mod = SpliceUnit() - pb_mod = pybuda.PyTorchModule('splice', mod) - - verify_module(pb_mod, [(1, 1, 32, 32), (1, 1, 32, 1)], - VerifyConfig(test_kind=TestKind.INFERENCE, - arch=test_device.arch, - devtype=test_device.devtype, - ), - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py index 15e0b24f..860ed007 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py @@ -30,36 +30,7 @@ from pybuda.verify.config import TestKind import pybuda -def test_gptj_block(test_kind, test_device): - if 
test_device.arch == pybuda.BackendDevice.Grayskull: - pytest.skip() - input_shape = (1, 128, 4096) - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if test_device.arch == BackendDevice.Wormhole_B0 or test_device.arch == BackendDevice.Blackhole: - pytest.skip() # see tenstorrent/pybuda#969 - - #Fusing disabled due to tenstorrent/pybuda#789 - if (test_kind == TestKind.INFERENCE): - compiler_cfg.enable_auto_fusing=False - config = GPTJConfig(n_layer=1) # for faster loading - config.rotary_dim = 64 - model = GPTJBlock(config) - - mod = PyTorchModule("gptj_block", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) def fixed_pos_embedding(x, seq_dim=1, seq_len=None): diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptneo.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptneo.py deleted file mode 100644 index 9acb9b85..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptneo.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# GPT Neo basic bring-up tests of tracing functionality -# -from pybuda._C.backend_api import BackendDevice -import pytest - -import torch -from transformers import GPTNeoModel, GPTNeoConfig -import os - -from pybuda import ( - PyTorchModule, - CompileDepth, - VerifyConfig, - BackendType, -) -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_gptneo_block(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - torch.manual_seed(52) - input_shape = (1, 64, 2560) - config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B", torchscript=True) - config.num_layers = 1 # For faster model loading - model = GPTNeoModel(config) - submodel = model.h[0] - mod = PyTorchModule("gptneo_block", submodel) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - uniform_inputs=True, - ) - -def test_gptneo_full(test_kind, test_device): - - # Pipegen error on silicon if enabled - os.environ["PYBUDA_DISABLE_STABLE_SOFTMAX"] = "1" - os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "100000" - - if test_kind == TestKind.TRAINING: - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - #Fusing disabled due to tenstorrent/pybuda#789 - if test_kind == TestKind.INFERENCE and test_device.arch == BackendDevice.Wormhole_B0: - compiler_cfg.enable_auto_fusing=False - - torch.manual_seed(52) - input_shape = (1, 256) - inputs = [torch.randint(0, input_shape[-1], input_shape)] - config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B", torchscript=True) - config.num_layers = 1 # For faster model loading - model = GPTNeoModel(config) - mod = PyTorchModule("gptneo_full", model) - - pcc = 0.96 if test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - mod, - (input_shape,), - inputs=[inputs], - 
verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - pcc=pcc, - ), - uniform_inputs=True, - ) - - os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "0" diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_nbeats.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_nbeats.py deleted file mode 100644 index d85e871c..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_nbeats.py +++ /dev/null @@ -1,165 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -from pybuda.config import CompileDepth -from pytorch_forecasting.models.nbeats.sub_modules import NBEATSBlock, NBEATSGenericBlock, NBEATSTrendBlock, NBEATSSeasonalBlock - -import torch -import pytest - -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_nbeats_block(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 64, 64, 64) - model = NBEATSBlock(100, 100, backcast_length=input_shape[-1]) - - mod = PyTorchModule("nbeats_block", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - - -def test_tvm_nbeats_generic_block(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - - input_shape = (1, 64, 64, 64) - - model = NBEATSGenericBlock(100, 100, backcast_length=input_shape[-1]) - - mod = PyTorchModule("nbeats_generic_block", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - -def test_tvm_nbeats_seasonal_block(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 64, 64, 64) - - class NBeatsSeasonal(NBEATSSeasonalBlock): - def __init__( - self, - units, - thetas_dim, - backcast_length - ): - super(). 
__init__( - units, - thetas_dim=thetas_dim, - backcast_length=backcast_length, - ) - - def forward(self, x): - x = super(NBEATSSeasonalBlock, self).forward(x) - amplitudes_backward = self.theta_b_fc(x) - backcast = amplitudes_backward.matmul(self.S_backcast) - amplitudes_forward = self.theta_f_fc(x) - forecast = amplitudes_forward.matmul(self.S_forecast) - - return backcast, forecast - - model = NBeatsSeasonal(100, 100, backcast_length=input_shape[-1]) - - mod = PyTorchModule("nbeats_seasonal_block", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - -def test_tvm_nbeats_trend_block(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - input_shape = (1, 64, 64, 64) - - class NBeatsTrend(NBEATSTrendBlock): - def __init__( - self, - units, - thetas_dim, - backcast_length - ): - super(). __init__( - units, - thetas_dim=thetas_dim, - backcast_length=backcast_length, - ) - - def forward(self, x): - x = super(NBEATSTrendBlock, self).forward(x) - backcast = self.theta_b_fc(x).matmul(self.T_backcast) - forecast = self.theta_f_fc(x).matmul(self.T_forecast) - return backcast, forecast - - model = NBeatsTrend(100, 100, backcast_length=input_shape[-1]) - - mod = PyTorchModule("nbeats_trend_block", model) - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) \ No newline at end of file diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_squeeze_bert.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_squeeze_bert.py deleted file mode 100644 index 4cc7f35d..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_squeeze_bert.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest - -import torch -import torch.nn as nn -# from transformers.models.squeezebert import SqueezeBertEncoder -from transformers import SqueezeBertModel, SqueezeBertConfig - -import math -import itertools -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, - tvm_to_python, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_SqueezeBertEncoder(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - input_shape = (1, 32, 768) - - config = SqueezeBertConfig() - config.num_hidden_layers = 1 - model = SqueezeBertModel(config) - - mod = PyTorchModule("SqueezeBertEncoder", model.encoder) - - attention_mask = torch.ones(input_shape[0:2]) - extended_attn_mask = model.get_extended_attention_mask(attention_mask, input_shape[0:2], "cpu") - verify_module( - mod, - (input_shape, extended_attn_mask.shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - 
devtype=test_device.devtype, - test_kind=test_kind, - ) - ) - - - -def test_tvm_SqueezeBertPooler(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - config = SqueezeBertConfig() - - model = SqueezeBertModel(config) - - mod = PyTorchModule("SqueezeBertPooler", model.pooler) - - input_shape = (1, 8, 768) - - verify_module( - mod, - (input_shape, ), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_xglm.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_xglm.py deleted file mode 100644 index ba8429c9..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_xglm.py +++ /dev/null @@ -1,219 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -from codeop import Compile -from colorama import Back -from pybuda.config import CompileDepth -import pytest -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from transformers import XGLMModel, XGLMConfig -from transformers.models.xglm.modeling_xglm import XGLMAttention, ACT2FN -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.op.eval import compare_tensor_to_golden -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -# Dont use cache for now -class XGLMDecoderLayer(nn.Module): - def __init__(self, config: XGLMConfig): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = XGLMAttention( - embed_dim=self.embed_dim, - num_heads=config.attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - - if config.add_cross_attention: - self.encoder_attn = XGLMAttention( - embed_dim=self.embed_dim, - num_heads=config.attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim) - self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - # DONT USE CACHE FOR NOW - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> torch.Tensor: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)* - attention_mask (`torch.FloatTensor`): attention mask of size - *(batch, 1, tgt_len, src_len)* where padding elements 
are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape *(seq_len, batch, embed_dim)* - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - *(encoder_attention_heads,)*. - cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of - size *(decoder_attention_heads,)*. - past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def test_xglm(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - # compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - configuration = 
XGLMConfig() - model = XGLMDecoderLayer(configuration) - - submodel = model - mod = PyTorchModule("XGLMDecoderLayer", submodel) - - input_shape = (1, 32, 1024) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"self_attn.k_proj.bias"}, - ) - ) - - -def test_xglm_full(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.cpu_fallback_ops.add("take") - - configuration = XGLMConfig() - configuration.num_layers = 4 - configuration.return_dict = False - model = XGLMModel(configuration) - - class XGLMModelWrapper(nn.Module): - def __init__(self, xglm): - super().__init__() - self.xglm = xglm - - def forward(self, x): - out = self.xglm(x) - return out[0] - - mod = PyTorchModule("XGLMModel", XGLMModelWrapper(model)) - - input_shape = (1, 64) - inputs = [torch.randint(0, input_shape[-1], input_shape)] - pcc = 0.9 if test_kind.is_training() and test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - mod, - (input_shape,), - inputs=[inputs], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"self_attn.k_proj.bias"}, - pcc=pcc, - ), - input_params=[{"requires_grad": False}] - ) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py b/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py deleted file mode 100644 index d32ec8ed..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from transformers import WhisperConfig, WhisperModel, AutoProcessor, AutoFeatureExtractor, WhisperProcessor - -import pytest - -import torch -from pybuda import ( - PyTorchModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, -) - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from test.utils import download_model - -from loguru import logger -from datasets import load_dataset -from transformers import AutoTokenizer, CodeGenForCausalLM - - -def test_codegen_single_layer_fallback(test_kind, test_device): - - if test_kind.is_training(): - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.retain_tvm_python_files = True - - framework_model = download_model(CodeGenForCausalLM.from_pretrained, "Salesforce/codegen-350M-mono", use_cache=False, n_layer=1, return_dict=False) - - class CodegenTransformer(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_ids, attention_mask, ): - outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,return_dict=False) - return outputs - mod = PyTorchModule("CodegenTransformer", CodegenTransformer(framework_model.transformer)) - input_shape = (1, 5) - input_ids = torch.tensor([[ 4299, 23748, 62, 6894, 33529]]) - input_shape2 = (1, 5) - attention_mask = torch.ones(input_shape2) - - verify_module( - mod, - (input_shape, input_shape2,), - inputs=[(input_ids, attention_mask,)], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - verify_pybuda_codegen_vs_framework=True, - run_golden=True, - 
pcc=0.98 - ), - input_params=[ - {"requires_grad": False, "data_format": torch.int}, - {"requires_grad": False, "data_format": torch.int}, - ], - ) - diff --git a/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py b/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py deleted file mode 100644 index c2ce0a26..00000000 --- a/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py +++ /dev/null @@ -1,375 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import pytest - -import torch -from transformers import ( - AutoProcessor, - WhisperForConditionalGeneration, - WhisperConfig, - WhisperTokenizer, - WhisperFeatureExtractor, - LogitsProcessorList -) -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask - -import pybuda -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda._C.backend_api import BackendType -from pybuda import PyTorchModule, VerifyConfig -from pybuda.config import _get_global_compiler_config -from test.utils import download_model - -from pybuda.pybudaglobal import TILE_DIM - -variants = [ - "openai/whisper-tiny", - # "openai/whisper-base", - # "openai/whisper-small", - # "openai/whisper-medium", - # "openai/whisper-large", -] - - -@pytest.mark.parametrize("variant", variants, ids=variants) -def test_whisper_encoder(test_device, variant): - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.amp_level = 1 - compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - pcc = 0.93 if test_device.devtype == BackendType.Silicon else 0.99 - - if variant == "openai/whisper-small" or variant == "openai/whisper-medium" or variant == "openai/whisper-large": - os.environ["PYBUDA_PAD_MM"] = "{47:48}" - - class Wrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_features): - enc_out = self.model.model.encoder( - input_features - ) - - return enc_out[0] - - # Load model (with tokenizer and feature extractor) - processor = download_model(AutoProcessor.from_pretrained, variant) - framework_model = download_model( - WhisperForConditionalGeneration.from_pretrained, - variant, - use_cache=False, - return_dict=False, - ) - - framework_model = Wrapper(framework_model) - pybuda_model = PyTorchModule("pt_whisper", framework_model) - - # Load and preprocess sample audio - sample = torch.load("pybuda/test/model_demos/utils/nlp/pytorch/1272-128104-0000.pt") - sample_audio = sample["audio"]["array"] - - inputs = processor(sample_audio, return_tensors="pt") - input_features = inputs.input_features - - # Sanity run - out = framework_model(input_features) - - verify_module( - pybuda_model, - [ - (input_features.shape), - ], - inputs=[ - (input_features), - ], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - pcc=pcc, - ), - ) - - -@pytest.mark.parametrize("variant", variants, ids=variants) -def test_whisper_decoder(test_device, variant): - if test_device.arch == pybuda.BackendDevice.Grayskull: - pytest.skip() - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.amp_level = 1 - compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - - class Wrapper(torch.nn.Module): - def 
__init__(self, model): - super().__init__() - self.model = model - - self.decoder_attention_mask = torch.ones((1, 1)) - - def forward(self, decoder_input_ids, encoder_hidden_states): - dec_out = self.model.model.decoder( - decoder_input_ids, - self.decoder_attention_mask, - encoder_hidden_states, - ) - lin_out = self.model.proj_out(dec_out[0]) - - return lin_out - - # Load model (with tokenizer and feature extractor) - processor = download_model(AutoProcessor.from_pretrained, variant) - model_config = WhisperConfig() - - # Reduce size of model for testing - # model_config.use_cache = False - # model_config.return_dict = False - # model_config.decoder_attention_heads = 1 - # model_config.decoder_layers = 1 - # model_config.encoder_attention_heads = 1 - # model_config.encoder_layers = 1 - # model_config.num_hidden_layers = 1 - # model_config.d_model = 384 - # framework_model = download_model( - # WhisperForConditionalGeneration.from_pretrained, - # variant, - # config=model_config, - # ) - - framework_model = download_model( - WhisperForConditionalGeneration.from_pretrained, - variant, - use_cache=False, - return_dict=False, - ) - - framework_model = Wrapper(framework_model) - pybuda_model = PyTorchModule("pt_whisper", framework_model) - - # Load and preprocess sample audio - sample = torch.load("pybuda/test/model_demos/utils/nlp/pytorch/1272-128104-0000.pt") - sample_audio = sample["audio"]["array"] - - inputs = processor(sample_audio, return_tensors="pt") - input_features = inputs.input_features - - # Get decoder inputs - decoder_input_ids = torch.tensor([[1, 1]]) * model_config.decoder_start_token_id - decoder_input_ids = decoder_input_ids.to(torch.int32) - encoder_outputs = framework_model.model.model.encoder(input_features)[0].detach() - encoder_outputs = encoder_outputs.to(torch.float32) - - # Sanity run - out = framework_model(decoder_input_ids, encoder_outputs) - - pcc = 0.96 if test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - pybuda_model, - [ - (decoder_input_ids.shape, encoder_outputs.shape), - ], - inputs=[ - (decoder_input_ids, encoder_outputs), - ], - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=TestKind.INFERENCE, - pcc=pcc, - ), - ) - - -class Whisper_encoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_features): - return self.model.model.encoder(input_features=input_features) - -class Whisper_decoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, position_embeds, *past_key_values): - presents = [] - pkv = [] - - input_embeds = self.model.model.decoder.embed_tokens(decoder_input_ids) - hidden_states = input_embeds + position_embeds - - attention_mask = _prepare_4d_causal_attention_mask(decoder_attention_mask, decoder_input_ids.size(), input_embeds, past_key_values[0].shape[2]) - - presents = [] - for i, decoder_layer in enumerate(self.model.model.decoder.layers): - pkv = tuple([past_key_values[(i * 4) + j] for j in range(4)]) - - - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=encoder_last_hidden_state, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=pkv, - output_attentions=False, - use_cache=True, - ) - hidden_states = layer_outputs[0] - presents.append(layer_outputs[1]) - - hidden_states = 
self.model.model.decoder.layer_norm(hidden_states) - lm_logits = self.model.proj_out(hidden_states) - - return lm_logits, *presents - -@pytest.mark.parametrize("variant", variants, ids=variants) -def test_whisper_enc_dec(test_device, variant): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.amp_level = 1 - compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon - compiler_cfg.input_queues_on_host = True - compiler_cfg.compile_subgraphs = True - compiler_cfg.enable_link_past_cache_ios = True - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" - os.environ["TT_BACKEND_PROFILER"] = "1" - - # pybuda.set_configuration_options(performance_trace=pybuda.PerfTraceLevel.VERBOSE) - processor = download_model(AutoProcessor.from_pretrained, variant) - config = WhisperConfig.from_pretrained(variant) - max_length = config.max_length - model = download_model( - WhisperForConditionalGeneration.from_pretrained, - variant, - return_dict=False, - ) - feature_extractor = download_model(WhisperFeatureExtractor.from_pretrained, variant) - tokenizer = WhisperTokenizer.from_pretrained(variant) - encoder_module = pybuda.PyTorchModule("Whisper_encoder", Whisper_encoder(model)) - decoder_module_cross_attention = pybuda.PyTorchModule("Whisper_decoder_with_ca", Whisper_decoder(model)) - decoder_module_no_cross_attention = pybuda.PyTorchModule("Whisper_decoder_no_ca", Whisper_decoder(model)) - - for i in range(config.decoder_layers): - pybuda.config.override_t_stream_shape(f"model.model.decoder.layers.{i}.self_attn.k_proj.weight_cache_nop", [13, 1]) - pybuda.config.override_t_stream_shape(f"model.model.decoder.layers.{i}.self_attn.v_proj.weight_cache_nop", [13, 1]) - - sample = torch.load("pybuda/test/model_demos/utils/nlp/pytorch/1272-128104-0000.pt") - sample_audio = sample["audio"]["array"] - - inputs = processor(sample_audio, return_tensors="pt") - - input_features = inputs.input_features - - encoder_last_hidden_state_shape = (1, config.max_source_positions, config.d_model) - encoder_last_hidden_state = torch.zeros(encoder_last_hidden_state_shape) - - logits_processor = model._get_logits_processor(model.generation_config, TILE_DIM, input_features, None, LogitsProcessorList()) - sequence_length = 1500 - decoder_attention_mask = torch.zeros((1, max_length)) - decoder_input_ids = torch.ones((1, TILE_DIM), dtype=torch.int) * tokenizer.pad_token_id - first_current_index = max_length - TILE_DIM - position_embeds = torch.zeros((TILE_DIM, config.d_model)) - enc_past_cache_self_shape = (1, config.decoder_attention_heads, max_length-TILE_DIM, config.d_model // config.decoder_attention_heads) - enc_past_cache_cross_shape = (1, 1, 1, 1) - - decoder_with_ca_inputs = [decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, position_embeds] - for _ in range(config.decoder_layers): - decoder_with_ca_inputs += [torch.zeros(enc_past_cache_self_shape), torch.zeros(enc_past_cache_self_shape), - torch.zeros(enc_past_cache_cross_shape), torch.zeros(enc_past_cache_cross_shape)] - - dec = Whisper_decoder(model) - dec(*decoder_with_ca_inputs) - enc_past_cache_cross_shape = (1, config.decoder_attention_heads, sequence_length, config.d_model // config.decoder_attention_heads) - decoder_no_ca_inputs = [decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, 
position_embeds] - for _ in range(config.decoder_layers): - decoder_no_ca_inputs += [torch.zeros(enc_past_cache_self_shape), torch.zeros(enc_past_cache_self_shape), - torch.zeros(enc_past_cache_cross_shape), torch.zeros(enc_past_cache_cross_shape)] - - tt0 = pybuda.TTDevice( - "tt0", - devtype=test_device.devtype, - arch=test_device.arch, - module=[decoder_module_cross_attention, decoder_module_no_cross_attention]) - # module=[encoder_module, decoder_module_cross_attention, decoder_module_no_cross_attention]) - - output_q = pybuda.initialize_pipeline( - training=False, - sample_inputs=( - # (input_features,), - (decoder_with_ca_inputs), - (decoder_no_ca_inputs), - )) - - import time - decoder_attention_mask = torch.zeros((1, max_length)) - decoder_input_ids[0, 0] = tokenizer.encode('<|startoftranscript|>')[0] - decoder_attention_mask[0, first_current_index] = 1 - current_token_index = 0 - - prefix_tokens = processor.get_decoder_prompt_ids(language="english", task="transcribe") - for idx, token in prefix_tokens: - decoder_input_ids[0, idx] = token - decoder_attention_mask[0, first_current_index + idx] = 1 - current_token_index = idx - - # encoder hangs, for now run on cpu - encoder_last_hidden_state = model.model.encoder(input_features)[0].detach() - start = time.time() - # tt0.set_active_subgraph(0) - # tt0.push_to_inputs((input_features, )) - # pybuda.run_forward() - # ans = output_q.get() - # encoder_last_hidden_state = ans[0].value().detach() - generated_tokens = [] - encoder_last_hidden_state_consumed = False - position_ids = torch.arange(32, dtype=torch.long) - position_embeds = model.model.decoder.embed_positions.weight[position_ids] - tokens_to_generate = max_length if test_device.devtype == BackendType.Silicon else 3 - for _ in range(tokens_to_generate): - if not encoder_last_hidden_state_consumed: - encoder_last_hidden_state_consumed = True - tt0.set_active_subgraph(0) - generate_inputs = (decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, position_embeds) - tt0.push_to_inputs(generate_inputs) - pybuda.run_generate(input_count=1, write_index=current_token_index//TILE_DIM) - ans = output_q.get() - else: - tt0.set_active_subgraph(1) - generate_inputs = (decoder_input_ids, decoder_attention_mask, position_embeds) - tt0.push_to_inputs(generate_inputs) - pybuda.run_generate(input_count=1, write_index=current_token_index//TILE_DIM) - ans = output_q.get() - - lm_head_out = ans[0].value().detach() - scores = logits_processor(decoder_input_ids[:, :current_token_index], lm_head_out[:, current_token_index % TILE_DIM]) - next_token = torch.argmax(scores, dim=-1).item() - generated_tokens.append(next_token) - print(f"generated tokens: {tokenizer.decode(generated_tokens)}") - - current_token_index += 1 - if current_token_index % TILE_DIM == 0: - position_ids = position_ids + TILE_DIM - position_embeds = model.model.decoder.embed_positions.weight[position_ids] - decoder_attention_mask[0, :current_token_index] = 1 - decoder_attention_mask[0, first_current_index:] = 0 - decoder_input_ids[0, :] = tokenizer.pad_token_id - - decoder_input_ids[0, current_token_index % TILE_DIM] = next_token - decoder_attention_mask[0, first_current_index + (current_token_index % TILE_DIM)] = 1 - end = time.time() - print(f"{len(generated_tokens)} iterations took {end - start} seconds, speed: {(len(generated_tokens)) / (end - start)} iters/sec") - print(f"generated tokens: {tokenizer.decode(generated_tokens)}") From fd01f2c509d943016d0b79f1e93a83a5c78b530b Mon Sep 17 00:00:00 2001 From: jserbedzija 
Date: Mon, 15 Jul 2024 10:44:33 -0400 Subject: [PATCH 033/116] [Blackhole] Fix issue with DRAM channel size (cherry picked from commit 81810092da5c996c0364cb99f694a33d3dd8feb4) --- pybuda/csrc/backend_api/device_config.hpp | 2 +- pybuda/csrc/placer/dram.hpp | 4 ++-- pybuda/csrc/placer/dram_allocator.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pybuda/csrc/backend_api/device_config.hpp b/pybuda/csrc/backend_api/device_config.hpp index 4ef30d90..208df332 100644 --- a/pybuda/csrc/backend_api/device_config.hpp +++ b/pybuda/csrc/backend_api/device_config.hpp @@ -287,7 +287,7 @@ struct DeviceConfig // TODO - get from backend, but backend needs to add it return is_grayskull() ? 1 : 3; } - std::uint32_t get_dram_channel_capacity() const { return get("dram-channel_capacity", false); } + std::size_t get_dram_channel_capacity() const { return get("dram-channel_capacity", false); } std::size_t get_dram_bandwidth_per_block_theoretical() const { return get("dram-bandwidth_per_block_theoretical", false); diff --git a/pybuda/csrc/placer/dram.hpp b/pybuda/csrc/placer/dram.hpp index b4a278b4..e0b627ec 100644 --- a/pybuda/csrc/placer/dram.hpp +++ b/pybuda/csrc/placer/dram.hpp @@ -36,9 +36,9 @@ struct DramConfig { uint32_t channel; uint32_t sub_channel; - uint32_t channel_size; + size_t channel_size; Coord location; - uint32_t initial_dram_offset; + size_t initial_dram_offset; static std::vector get_config(DeviceConfig const &device_config) { diff --git a/pybuda/csrc/placer/dram_allocator.cpp b/pybuda/csrc/placer/dram_allocator.cpp index ce189c32..3ec2fbc8 100644 --- a/pybuda/csrc/placer/dram_allocator.cpp +++ b/pybuda/csrc/placer/dram_allocator.cpp @@ -185,8 +185,8 @@ DramAllocator::DramAllocator( switch (allocator_algorithm) { case BEST_FIT: - std::uint32_t p2p_offset; - std::uint32_t p2p_size; + std::size_t p2p_offset; + std::size_t p2p_size; if (chip_id == 0) { From 4cc8bfbffe84010a1bb812b1e3487955d8af86fa Mon Sep 17 00:00:00 2001 From: Jackson Nie Date: Tue, 16 Jul 2024 19:08:54 +0000 Subject: [PATCH 034/116] Refactor multicard dp API to bypass compile/shutdown on multiple runs (cherry picked from commit 88f9a5f5ce3214b58d1d7e3663463f9d03bce4a7) --- ...gitlab-ci.wormhole_b0_t3k_silicon_push.yml | 16 + pybuda/pybuda/tools/tti_data_parallel.py | 846 ++++++++++-------- pybuda/pybuda/tti/archive.py | 11 +- pybuda/test/benchmark/benchmark.py | 44 +- pybuda/test/tti/test_tti_data_parallel.py | 134 ++- 5 files changed, 580 insertions(+), 471 deletions(-) create mode 100644 ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_t3k_silicon_push.yml diff --git a/ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_t3k_silicon_push.yml b/ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_t3k_silicon_push.yml new file mode 100644 index 00000000..a47a8e8b --- /dev/null +++ b/ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_t3k_silicon_push.yml @@ -0,0 +1,16 @@ +.backend-silicon-wh-b0-t3k-common: + extends: .backend-silicon-wh-b0-common + stage: sanity-wh-b0-t3k-silicon + tags: + - t3k + - push + +pybuda-silicon-wh-b0-t3k-tti-data-parallel: + extends: .backend-silicon-wh-b0-t3k-common + script: + - !reference [.backend-silicon-wh-b0-t3k-common, script] + # Run this on x2 for now as a sanity test + # Move this to t3000 once we have more t3000 machines + # - source pybuda/test/benchmark/run_benchmark_tti_data_parallel + - PYBUDA_FORCE_THREADS=1 pytest -svv pybuda/test/tti/test_tti_data_parallel.py::test_tti_mmio_dp_sanity + diff --git a/pybuda/pybuda/tools/tti_data_parallel.py b/pybuda/pybuda/tools/tti_data_parallel.py 
index 04be5763..47fcbf7c 100644
--- a/pybuda/pybuda/tools/tti_data_parallel.py
+++ b/pybuda/pybuda/tools/tti_data_parallel.py
@@ -9,6 +9,7 @@
 import os
 import queue
 import threading
+import traceback
 import shutil
 from typing import Iterable, Optional, Dict, List, Tuple, Union, Any
 import pybuda
@@ -17,29 +18,37 @@ from enum import Enum
 
 OUTPUT_TTI_NAME = "parallel_tti_run.tti"
 
+
+class Status(Enum):
+    SUCCESS = "SUCCESS"
+    ERROR = "ERROR"
+
 class RunMode(Enum):
-    FORWARD = 1
-    GENERATIVE = 2
+    FORWARD = "FORWARD"
+    GENERATIVE = "GENERATIVE"
+
+class RunnerState(Enum):
+    UNINITIALIZED = "UNINITIALIZED"
+    INITIALIZED = "INITIALIZED"
+    SHUTDOWN = "SHUTDOWN"
 
 @dataclass
-class ForwardRunInputs:
-    inputs: Iterable[torch.Tensor] = None
+class ForwardInputs:
+    run_inputs: Iterable[torch.Tensor] = None
 
+    def __len__(self):
+        return len(self.run_inputs)
+
+    def __getitem__(self, index):
+        return self.run_inputs[index]
+
     @staticmethod
-    def get_inputs_per_card(all_inputs: "ForwardRunInputs", num_cards: int) -> List["ForwardRunInputs"]:
-        run_inputs_per_card = split_tensor_batch(all_inputs.inputs, num_cards)
-        inputs_per_card: List[ForwardRunInputs] = []
-        for card_index in range(num_cards):
-            inputs_per_card.append(
-                ForwardRunInputs(
-                    inputs=run_inputs_per_card[card_index]
-                )
-            )
+    def split_inputs_per_card(all_inputs: "ForwardInputs", num_cards: int) -> List["ForwardInputs"]:
+        inputs_per_card = [ForwardInputs(run_inputs=split) for split in split_tensor_batch(all_inputs.run_inputs, num_cards)]
         return inputs_per_card
-    
+
 @dataclass
-class GenerativeRunInputs:
-    compile_inputs: Iterable[torch.Tensor] = None
+class GenerativeInputs:
     run_inputs: Iterable[torch.Tensor] = None
     num_tokens_to_generate: int = None
     write_index: int = 0
@@ -47,23 +56,24 @@ class GenerativeRunInputs:
     pad_token_id: Optional[int] = None
 
     def __post_init__(self):
-        assert self.compile_inputs
         assert self.run_inputs
         assert self.num_tokens_to_generate
-    
+
+    def __len__(self):
+        return len(self.run_inputs)
+
+    def __getitem__(self, index):
+        return self.run_inputs[index]
 
     @staticmethod
-    def get_inputs_per_card(all_inputs: "GenerativeRunInputs", num_cards: int) -> List["GenerativeRunInputs"]:
+    def split_inputs_per_card(all_inputs: "GenerativeInputs", num_cards: int) -> List["GenerativeInputs"]:
         # autograd does not support crossing process boundaries, this is an issue for whisper
         # detach all input tensors from compute graph to bypass this issue
-        compile_inputs_per_card = detach_all_tensors(split_tensor_batch(all_inputs.compile_inputs, num_cards))
         run_inputs_per_card = detach_all_tensors(split_tensor_batch(all_inputs.run_inputs, num_cards))
-
-        inputs_per_card: List[GenerativeRunInputs] = []
+        inputs_per_card: List[GenerativeInputs] = []
         for card_index in range(num_cards):
             inputs_per_card.append(
-                GenerativeRunInputs(
-                    compile_inputs=compile_inputs_per_card[card_index],
+                GenerativeInputs(
                     run_inputs=run_inputs_per_card[card_index],
                     num_tokens_to_generate=all_inputs.num_tokens_to_generate,
                     write_index=all_inputs.write_index,
@@ -74,95 +84,84 @@ def get_inputs_per_card(all_inputs: "GenerativeRunInputs", num_cards: int) -> Li
         return inputs_per_card
 
-
 @dataclass
-class ForwardRunConfig:
+class CompileConfigForward:
     chip_ids: List[int] = field(default_factory=list)
-    inputs: ForwardRunInputs = None
+    compile_inputs: Iterable[torch.Tensor] = None
     tti_path: str = ""
-    loop_count: int = 0
+    # Follows the flow of benchmark.py: give the input-push thread a 2-second head start
+    benchmark_perf: bool = False
 
     def __post_init__(self):
         assert self.chip_ids
-        assert self.inputs
+        assert 
self.compile_inputs assert self.tti_path - assert self.loop_count - - def inputs_for_compile(self): - return self.inputs.inputs - - def inputs_for_run(self): - return self.inputs.inputs - @dataclass -class GenerativeRunConfig: +class CompileConfigGenerative: chip_ids: List[int] = field(default_factory=list) - inputs: GenerativeRunInputs = None + compile_inputs: Iterable[torch.Tensor] = None tti_path: str = "" def __post_init__(self): assert self.chip_ids - assert self.inputs + assert self.compile_inputs assert self.tti_path - def inputs_for_compile(self): - return self.inputs.compile_inputs - - def inputs_for_run(self): - return self.inputs.run_inputs - @dataclass -class RunEvents: +class ProcessEvents: # Set by the child process when its done running done_event: torch.multiprocessing.Event = None - # Set by the main process when the process can be terminated - kill_event: torch.multiprocessing.Event = None - - # Set by the child process when the process has started - # In a pytest environment, pre-process-start, we run various setup functions - # including create-ethernet-map - process_start_event: torch.multiprocessing.Event = None + # Set by the main process to synchronize the start of the run across processes + run_event: torch.multiprocessing.Event = None - # Set by the first child process after it has finished loading (unzipping) the tti - tti_first_load_event: torch.multiprocessing.Event = None + # Set by the child process after it has finished initializing pipeline + initialize_completed_event: torch.multiprocessing.Event = None - # Optional: Set by the main process to synchronize the start of the run across processes - run_event: Optional[torch.multiprocessing.Event] = None + # Shared event between all processes, set by the main process when the process can be terminated + kill_event: torch.multiprocessing.Event = None - # Optional: Set by the child process after it has finished initializing pipeline - initialize_completed_event: Optional[torch.multiprocessing.Event] = None + # Shared event between all processes, set by any process that raised an error + error_event: torch.multiprocessing.Event = None def __post_init__(self): assert self.done_event assert self.kill_event - assert self.process_start_event + assert self.run_event + assert self.initialize_completed_event + + + @staticmethod + def wait_for_event(target_event: torch.multiprocessing.Event, error_event: torch.multiprocessing.Event, timeout=10) -> Status: + while True: + if target_event.wait(timeout=timeout): + return Status.SUCCESS + + if error_event.is_set(): + return Status.ERROR - def wait_for_initialize_complete(self): - if self.initialize_completed_event: - self.initialize_completed_event.wait() - - def wait_for_run_complete(self): - self.done_event.wait() - @dataclass -class RunOutputs: - # Contains the outputs of the run - output_tensors_path: Optional[str] = "" - - # Contains the start and end time of the run in tuple format (start_time, end_time) - perf_q: Optional[torch.multiprocessing.Queue] = None - - def get_output_tensors(self): - if self.output_tensors_path: - return torch.load(self.output_tensors_path) - return None +class ProcessQueues: + input_queue: torch.multiprocessing.Queue = None + output_queue: torch.multiprocessing.Queue = None + perf_queue: torch.multiprocessing.Queue = None + config_queue: torch.multiprocessing.Queue = None + + def __post_init__(self): + assert self.input_queue and self.output_queue and self.perf_queue + + def push_inputs(self, inputs: List[torch.Tensor]): + 
self.input_queue.put(inputs) - def get_start_end_time(self): - if self.perf_q is not None: - return self.perf_q.get() - return None + def pop_outputs(self, timeout=10): + return self.output_queue.get(timeout=timeout) + + def get_start_end_time(self, timeout=120): + return self.perf_queue.get(timeout=timeout) + + def get_next_config(self, timeout=120): + return self.config_queue.get(timeout=timeout) @dataclass class RunResult: @@ -200,88 +199,77 @@ def get_total_runtime(self): class ForwardRun: # Runs the tti on a single device and gathers outputs @staticmethod - def _multi_thread_forward_run(config: ForwardRunConfig, events: RunEvents, output_wrapper: RunOutputs): - # Create ethernet map runs at the beginning of every process in a pytest environment - # Create ethernet map is not process safe - events.process_start_event.set() - - tt0 = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_overrides=config.chip_ids) - - # For the first device process, set the event to notify the main process the tti has been unzipped - # So that the main process can launch other processes - # Prevents processes from racing to unzip the tti - if events.tti_first_load_event: - events.tti_first_load_event.set() + def _multi_thread_forward_run(compile_config: CompileConfigForward, events: ProcessEvents, queues: ProcessQueues): + + tt0 = pybuda.TTDevice.load_image(img_path=compile_config.tti_path, device_id_overrides=compile_config.chip_ids) - device_output_q = pybuda.initialize_pipeline(training=False, sample_inputs=config.inputs_for_compile()) - all_outputs = [] + device_output_q = pybuda.initialize_pipeline(training=False, sample_inputs=compile_config.compile_inputs) - def push_inputs_thread(tt_device: pybuda.TTDevice, inputs, loop_count: int): + def push_inputs_thread(tt_device: pybuda.TTDevice, main_process_input_q, loop_count: int): for _ in range(loop_count): if pybuda.error_raised(): print(" * Aborting input thread due to error") return + inputs = main_process_input_q.get(timeout=60) tt_device.push_to_inputs(inputs) - def pop_outputs_thread(output_q, all_outputs, loop_count: int): + def pop_outputs_thread(device_output_q, main_process_output_q, loop_count: int): for _ in range(loop_count): while True: try: - outputs = output_q.get(timeout=1) - all_outputs.append(outputs) + outputs = device_output_q.get(timeout=1) + main_process_output_q.put([output.to_pytorch() for output in outputs]) break except queue.Empty as _: if pybuda.error_raised(): print(" * Aborting output thread due to error") return - - if events.initialize_completed_event: - events.initialize_completed_event.set() - - all_outputs = [] - output_thread = threading.Thread(target=pop_outputs_thread, args=(device_output_q, all_outputs, config.loop_count)) - input_thread = threading.Thread(target=push_inputs_thread, args=(tt0, config.inputs_for_run(), config.loop_count)) - - # mimicking pybuda/test/benchmark/benchmark.py - # Wait for this event to be set and start running - if events.run_event: - events.run_event.wait() - - output_thread.start() - input_thread.start() - time.sleep(2) # Let the input thread start up and transfer initial data, reaching something like "steady state" - - start = time.time() - pybuda.run_forward(input_count=config.loop_count) + events.initialize_completed_event.set() - input_thread.join() - output_thread.join() - - end = time.time() - - if output_wrapper.output_tensors_path: - all_outputs_torch = [] - for outputs in all_outputs: - all_outputs_torch.append([output.to_pytorch() for output in outputs]) - - 
logger.info(f"Saving outputs temporarily to {output_wrapper.output_tensors_path} for main process to pick up, this may take a while for large outputs") - torch.save(all_outputs_torch, output_wrapper.output_tensors_path) - - if output_wrapper.perf_q: - output_wrapper.perf_q.put((start, end)) - - pybuda.shutdown() - - # Reading tensors from queues requires this process to be alive - # Set done_event to notify the main process that outputs can be read - # Wait for kill_event to terminate the process - events.done_event.set() - events.kill_event.wait() + while not events.kill_event.is_set() and not events.error_event.is_set(): + if not events.run_event.wait(timeout=1): + continue + + loop_count = queues.get_next_config() + input_thread = threading.Thread(target=push_inputs_thread, args=(tt0, queues.input_queue, loop_count)) + output_thread = threading.Thread(target=pop_outputs_thread, args=(device_output_q, queues.output_queue, loop_count)) + input_thread.start() + output_thread.start() + + if compile_config.benchmark_perf: + time.sleep(2) # Let the input thread start up and transfer initial data, reaching something like "steady state" + + start = time.time() + + pybuda.run_forward(input_count=loop_count) + + input_thread.join() + output_thread.join() + + end = time.time() + + queues.perf_queue.put((start, end)) + events.run_event.clear() + events.done_event.set() + + @staticmethod + def multi_thread_forward_run(compile_config: CompileConfigForward, events: ProcessEvents, queues: ProcessQueues): + try: + ForwardRun._multi_thread_forward_run(compile_config, events, queues) + + except Exception as e: + logger.error(f"Process running on chips {compile_config.chip_ids} raised an exception: {str(e)}") + print(traceback.format_exc()) + events.error_event.set() + + finally: + pybuda.shutdown() + @staticmethod - def _create_run_result( - # List of outputs per card, per loop + def create_run_result( + # List of outputs per card_index, per loop outputs_per_card: List[List[List[torch.tensor]]], per_card_runtime: Dict[int, Tuple[float, float]] ): @@ -323,115 +311,115 @@ def _create_run_result( # Namespace for generative run APIs class GenerativeRun: + + # Currently all generative models are tested with batch 1 in benchmark.py. These models may not work with larger batch. 
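+    # (split_tensor_batch can already shard the inputs along the batch dimension,
+    # but merging the per-card generated token streams is still a TODO below, so
+    # larger batches are not exercised here.)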
+ # Assume batch 1 inputs for now @staticmethod - def _single_thread_generative_model_run(config: GenerativeRunConfig, events: RunEvents, output_wrapper: RunOutputs): - # Create ethernet map runs at the beginning of every process in a pytest environment - # Create ethernet map is not process safe - events.process_start_event.set() + def _single_thread_generative_model_run(compile_config: CompileConfigGenerative, events: ProcessEvents, queues: ProcessQueues): from pybuda.pybudaglobal import TILE_DIM - compile_inputs = config.inputs_for_compile() - run_inputs = config.inputs_for_run() - - first_device = pybuda.TTDevice.load_image(img_path=config.tti_path, device_id_overrides=config.chip_ids) - # For the first device process, set the event to notify the main process the tti has been unzipped - # So that the main process can launch other processes - # Prevents processes from racing to unzip the tti - if events.tti_first_load_event: - events.tti_first_load_event.set() + first_device = pybuda.TTDevice.load_image(img_path=compile_config.tti_path, device_id_overrides=compile_config.chip_ids) - output_q = pybuda.initialize_pipeline(training=False, sample_inputs=compile_inputs) - - if events.initialize_completed_event: - events.initialize_completed_event.set() - - first_current_index = config.inputs.first_current_index - pad_token_id = config.inputs.pad_token_id - write_index = config.inputs.write_index - loop_count = 1 - num_tokens_to_generate = config.inputs.num_tokens_to_generate - - input_ids = run_inputs[0] - encoder_attention_mask = run_inputs[1] - decoder_input_ids = run_inputs[2] - decoder_attention_mask = run_inputs[3] - is_text_inputs = (first_current_index is not None) - - - if events.run_event: - events.run_event.wait() + output_q = pybuda.initialize_pipeline(training=False, sample_inputs=compile_config.compile_inputs) + + events.initialize_completed_event.set() + while not events.kill_event.is_set() and not events.error_event.is_set(): + if not events.run_event.wait(timeout=1): + continue + + run_inputs: GenerativeInputs = queues.input_queue.get(timeout=60) + first_current_index = run_inputs.first_current_index + pad_token_id = run_inputs.pad_token_id + write_index = run_inputs.write_index + loop_count = 1 + num_tokens_to_generate = run_inputs.num_tokens_to_generate - start_time = time.time() + input_ids = run_inputs[0] + encoder_attention_mask = run_inputs[1] + decoder_input_ids = run_inputs[2] + decoder_attention_mask = run_inputs[3] + is_text_inputs = (first_current_index is not None) + + start_time = time.time() - first_device.set_active_subgraph(0) - if is_text_inputs: - first_device.push_to_inputs((input_ids, encoder_attention_mask)) - else: - first_device.push_to_inputs((input_ids,)) - - pybuda.run_forward() - ans = output_q.get() - encoder_last_hidden_state = ans[0].value().detach() - generated_tokens = [] - - current_token_index = 0 - for _ in range(num_tokens_to_generate): - if current_token_index == 0: - first_device.set_active_subgraph(1) - generate_inputs = (decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, encoder_attention_mask) - first_device.push_to_inputs(generate_inputs) - pybuda.run_generate(input_count=loop_count, write_index=write_index) - ans = output_q.get() + first_device.set_active_subgraph(0) + if is_text_inputs: + first_device.push_to_inputs((input_ids, encoder_attention_mask)) else: - if current_token_index == 1: - start_time1 = time.time() - first_device.set_active_subgraph(2) - generate_inputs = (decoder_input_ids, 
decoder_attention_mask, encoder_attention_mask) - first_device.push_to_inputs(generate_inputs) - pybuda.run_generate(input_count=loop_count, write_index=write_index) - ans = output_q.get() - - if is_text_inputs or current_token_index < 2: - current_token_index += 1 - - if is_text_inputs: - lm_head_out = ans[0].value().detach() - next_token = torch.argmax(lm_head_out[0, (current_token_index-1) % TILE_DIM]) - generated_tokens.append(next_token) - - if current_token_index % TILE_DIM == 0: - past_cache_pages = current_token_index // TILE_DIM - # after one page of past cache, we have to rotate. - first_device.set_active_subgraph(3) - pybuda.run_generate(input_count=0, write_index=0) - - pages_current = 1 - decoder_attention_mask[0, -(past_cache_pages + pages_current) * TILE_DIM:] = 1 - decoder_attention_mask[0, first_current_index:] = 0 - decoder_input_ids[0, :] = pad_token_id - - decoder_input_ids[0, current_token_index % TILE_DIM] = next_token - decoder_attention_mask[0, first_current_index + (current_token_index % TILE_DIM)] = 1 - - end_time = time.time() - - if output_wrapper.output_tensors_path: - torch.save(generated_tokens, output_wrapper.output_tensors_path) - - if output_wrapper.perf_q: - output_wrapper.perf_q.put((start_time, end_time)) - - pybuda.shutdown() + first_device.push_to_inputs((input_ids,)) + + pybuda.run_forward() + ans = output_q.get() + encoder_last_hidden_state = ans[0].value().detach() + generated_tokens = [] + + current_token_index = 0 + for _ in range(num_tokens_to_generate): + if current_token_index == 0: + first_device.set_active_subgraph(1) + generate_inputs = (decoder_input_ids, decoder_attention_mask, encoder_last_hidden_state, encoder_attention_mask) + first_device.push_to_inputs(generate_inputs) + pybuda.run_generate(input_count=loop_count, write_index=write_index) + ans = output_q.get() + else: + if current_token_index == 1: + start_time1 = time.time() + first_device.set_active_subgraph(2) + generate_inputs = (decoder_input_ids, decoder_attention_mask, encoder_attention_mask) + first_device.push_to_inputs(generate_inputs) + pybuda.run_generate(input_count=loop_count, write_index=write_index) + ans = output_q.get() - events.done_event.set() - events.kill_event.wait() + if is_text_inputs or current_token_index < 2: + current_token_index += 1 + if is_text_inputs: + lm_head_out = ans[0].value().detach() + next_token = torch.argmax(lm_head_out[0, (current_token_index-1) % TILE_DIM]) + generated_tokens.append(next_token) + + if current_token_index % TILE_DIM == 0: + past_cache_pages = current_token_index // TILE_DIM + # after one page of past cache, we have to rotate. 
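+                    # The rotation runs as a separate compiled subgraph (index 3) with
+                    # no new inputs (input_count=0); the decoder input ids and the
+                    # attention mask are re-initialized below for the next page.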
+ first_device.set_active_subgraph(3) + pybuda.run_generate(input_count=0, write_index=0) + + pages_current = 1 + decoder_attention_mask[0, -(past_cache_pages + pages_current) * TILE_DIM:] = 1 + decoder_attention_mask[0, first_current_index:] = 0 + decoder_input_ids[0, :] = pad_token_id + + decoder_input_ids[0, current_token_index % TILE_DIM] = next_token + decoder_attention_mask[0, first_current_index + (current_token_index % TILE_DIM)] = 1 + + end_time = time.time() + + queues.output_queue.put(generated_tokens) + + queues.perf_queue.put((start_time, end_time)) + + events.run_event.clear() + events.done_event.set() + + @staticmethod + def single_thread_generative_model_run(compile_config: CompileConfigGenerative, events: ProcessEvents, queues: ProcessQueues): # TODO: Implement output merging for n300 data-parallel generative runs once its supported + try: + GenerativeRun._single_thread_generative_model_run(compile_config, events, queues) + + except Exception as e: + logger.error(f"Process running on chips {compile_config.chip_ids} raised an exception: {str(e)}") + print(traceback.format_exc()) + events.error_event.set() + + finally: + pybuda.shutdown() + @staticmethod - def _create_run_result( - # List of outputs per card - # each inner list is the list of generated tokens of that card, of length num_tokens_to_generate + def create_run_result( + # List of outputs per card_index + # each inner list is the list of generated tokens of that card_index, of length num_tokens_to_generate outputs_per_card: List[List[torch.tensor]], per_card_runtime: Dict[int, Tuple[float, float]] ): @@ -440,13 +428,231 @@ def _create_run_result( return RunResult(outputs_per_card, per_card_start_time, per_card_end_time) +class MultiCardRunner: + def __init__( + self, + run_mode: RunMode, + # Configs per card_index + device_ids: List[List[int]], + output_dir: str, + name: str = "MultiCardRunner" + ): + assert len(device_ids) > 0 + assert run_mode in [RunMode.FORWARD, RunMode.GENERATIVE] + assert output_dir + self.name = name + self._state: RunnerState = RunnerState.UNINITIALIZED + self._run_mode: RunMode = run_mode + self._num_cards: int = len(device_ids) + self._device_ids: List[List[int]] = device_ids + self._output_dir: str = output_dir + + self._mp_context = None + self._processes: List[torch.multiprocessing.Process] = None + self._all_events: List[ProcessEvents] = [] + self._all_queues: List[ProcessQueues] = [] + + # Shared events between all processes + self._kill_event = None + self._error_event = None + + def initialize(self, compile_configs: Union[List[CompileConfigForward], List[CompileConfigGenerative]]) -> None: + assert self._state == RunnerState.UNINITIALIZED, "Can't re-initialize a MultiCardRunner" + + self._mp_context = torch.multiprocessing.get_context('spawn') + self._processes: List[torch.multiprocessing.Process] = [] + + # init shared events + self._kill_event = self._mp_context.Event() + self._error_event = self._mp_context.Event() + + if self._run_mode == RunMode.FORWARD: + runner_function = ForwardRun.multi_thread_forward_run + + elif self._run_mode == RunMode.GENERATIVE: + runner_function = GenerativeRun.single_thread_generative_model_run + + for card_index, config in enumerate(compile_configs): + events = ProcessEvents( + run_event=self._mp_context.Event(), + done_event=self._mp_context.Event(), + initialize_completed_event=self._mp_context.Event(), + kill_event=self._kill_event, + error_event=self._error_event + ) + queues = ProcessQueues( + input_queue=self._mp_context.Queue(), + 
output_queue=self._mp_context.Queue(), + perf_queue=self._mp_context.Queue(), + config_queue=self._mp_context.Queue(), + ) + + self._all_events.append(events) + self._all_queues.append(queues) + + p = self._mp_context.Process( + target=runner_function, + args=(config, events, queues) + ) + p.start() + self._processes.append(p) + status = ProcessEvents.wait_for_event(events.initialize_completed_event, events.error_event) + if status == Status.ERROR: + self._terminate_and_report() + + self._state = RunnerState.INITIALIZED + logger.info(f"{self.name}: Initialize completed on all {self._num_cards} cards") + + def _run_forward(self, all_inputs: ForwardInputs) -> RunResult: + num_loops = len(all_inputs) + inputs_per_card: List[ForwardInputs] = ForwardInputs.split_inputs_per_card(all_inputs, self._num_cards) + outputs_per_card = [[] for _ in range(self._num_cards)] + + def pop_outputs(card_index: int, num_loops: int, error_event: torch.multiprocessing.Event): + for _ in range(num_loops): + while True: + try: + outputs = self._all_queues[card_index].pop_outputs(timeout=10) + outputs_per_card[card_index].append(outputs) + break + except queue.Empty as _: + if error_event.is_set(): + return + + pop_output_threads: List[threading.Thread] = [] + for card_index in range(self._num_cards): + t = threading.Thread(target=pop_outputs, args=(card_index, num_loops, self._error_event)) + t.start() + pop_output_threads.append(t) + self._assert(not self._all_events[card_index].run_event.is_set(), "Unexpected run event set before starting run") + self._all_events[card_index].run_event.set() + self._all_queues[card_index].config_queue.put(num_loops) + + for i in range(num_loops): + if self._error_event.is_set(): + break + for card_index, forward_inputs in enumerate(inputs_per_card): + self._all_queues[card_index].push_inputs(forward_inputs[i]) + if self._error_event.is_set(): + break + + self._wait_for_all_processes_done() + + for t in pop_output_threads: + t.join() + + if self._error_event.is_set(): + self._terminate_and_report() + + per_card_start_end = {card_index: self._all_queues[card_index].get_start_end_time() for card_index in range(self._num_cards)} + + run_result: RunResult = ForwardRun.create_run_result(outputs_per_card, per_card_start_end) + + return run_result + + def _run_generate(self, all_inputs: GenerativeInputs) -> RunResult: + inputs_per_card = GenerativeInputs.split_inputs_per_card(all_inputs, self._num_cards) + outputs_per_card = [[] for _ in range(self._num_cards)] + + # TODO: Maybe update this after we confirm that multi-batch works for generative run + def pop_outputs(card_index: int, error_event: torch.multiprocessing.Event): + while True: + try: + outputs = self._all_queues[card_index].pop_outputs(timeout=10) + outputs_per_card[card_index] = outputs + break + except queue.Empty as _: + if error_event.is_set(): + return + + pop_output_threads: List[threading.Thread] = [] + for card_index in range(self._num_cards): + t = threading.Thread(target=pop_outputs, args=(card_index, self._error_event)) + t.start() + pop_output_threads.append(t) + self._assert(not self._all_events[card_index].run_event.is_set(), "Unexpected run event set before starting run") + self._all_events[card_index].run_event.set() + + # Assuming batch 1, push all inputs at once + # Since batch size is 1, this shouldn't blow up the queue + for card_index, generative_inputs in enumerate(inputs_per_card): + self._all_queues[card_index].push_inputs(generative_inputs) + if self._error_event.is_set(): + break + + 
self._wait_for_all_processes_done() + + for t in pop_output_threads: + t.join() + + if self._error_event.is_set(): + self._terminate_and_report() + + per_card_start_end = {card_index: self._all_queues[card_index].get_start_end_time() for card_index in range(self._num_cards)} + + run_result: RunResult = GenerativeRun.create_run_result(outputs_per_card, per_card_start_end) + + return run_result + + def run( + self, + all_inputs: Union[ForwardInputs, GenerativeInputs], + ) -> RunResult: + assert self._state == RunnerState.INITIALIZED + logger.info(f"{self.name}: Launching {self._run_mode.value} run") + if self._run_mode == RunMode.FORWARD: + self._assert(isinstance(all_inputs, ForwardInputs), "Expected ForwardInputs for forward run") + result: RunResult = self._run_forward(all_inputs) + + elif self._run_mode == RunMode.GENERATIVE: + self._assert(isinstance(all_inputs, GenerativeInputs), "Expected GenerativeInputs for generative run") + result: RunResult = self._run_generate(all_inputs) + + return result + + + def _wait_for_all_processes_done(self) -> Status: + for events in self._all_events: + status = ProcessEvents.wait_for_event(events.done_event, events.error_event) + if status == Status.SUCCESS: + # Unset the done event for the next run + events.done_event.clear() + elif status == Status.ERROR: + break + + return status + + def _assert(self, cond, message: str = "") -> None: + if cond: + return + logger.error(f"{self.name}: error on main process with message {message}") + self._error_event.set() + self._terminate_and_report() + + def _terminate_and_report(self) -> None: + self.shutdown() + raise RuntimeError(f"{self.name}: Aborted due to error raised on one or more processes.") + + def shutdown(self) -> None: + self._state = RunnerState.SHUTDOWN + self._kill_event.set() + for p in self._processes: + p.join() + + self._processes = [] + self._all_events = [] + self._all_queues = [] + self._kill_event = None + self._error_event = None + self._mp_context = None + def _encode_chip_ids(chip_ids: List[int]) -> str: return "_".join([str(chip_id) for chip_id in chip_ids]) def _initialize_tti_image( output_dir: str, precompiled_tti_path: Optional[str] = None, -): +) -> str: # copy tti over to the output directory if it isn't already there precompiled_tti_path = os.path.realpath(precompiled_tti_path) precompiled_tti_name = os.path.basename(precompiled_tti_path) @@ -456,92 +662,6 @@ def _initialize_tti_image( return image_path -def _run( - run_mode: RunMode, - configs: Union[List[ForwardRunConfig], List[GenerativeRunConfig]], - output_dir: str, - sync_at_run_start: bool, - rm_tmp_dirs: bool, -): - procs = [] - device_ids_per_card = [config.chip_ids for config in configs] - num_cards = len(device_ids_per_card) - - mp_context = torch.multiprocessing.get_context('spawn') - all_events: List[RunEvents] = [] - all_output_wrappers: List[RunOutputs] = [] - # Shared events - kill_event = mp_context.Event() - run_event = mp_context.Event() if sync_at_run_start else None - - if run_mode == RunMode.FORWARD: - runner = ForwardRun._multi_thread_forward_run - - elif run_mode == RunMode.GENERATIVE: - runner = GenerativeRun._single_thread_generative_model_run - - # Temporary directories for each device to dump intermediates such as outputs - tmp_dirs = [os.path.join(output_dir, f"tmp_device_{_encode_chip_ids(chip_ids)}") for chip_ids in device_ids_per_card] - for tmp_dir in tmp_dirs: - os.makedirs(tmp_dir, exist_ok=True) - - for card_index, config in enumerate(configs): - events = RunEvents( - run_event=run_event, - 
kill_event=kill_event, - process_start_event=mp_context.Event(), - done_event=mp_context.Event(), - tti_first_load_event=mp_context.Event() if card_index == 0 else None, - initialize_completed_event=mp_context.Event() if sync_at_run_start else None, - ) - output_wrapper = RunOutputs( - output_tensors_path=os.path.join(tmp_dirs[card_index], f"output_tensors_{_encode_chip_ids(config.chip_ids)}.pth"), - perf_q=mp_context.Queue(), - ) - all_events.append(events) - all_output_wrappers.append(output_wrapper) - p = mp_context.Process( - target=runner, - args=(config, events, output_wrapper) - ) - p.start() - procs.append(p) - events.process_start_event.wait() - if events.tti_first_load_event: - events.tti_first_load_event.wait() - - if sync_at_run_start: - for device_events in all_events: - device_events.wait_for_initialize_complete() - - logger.info(f"Initialize completed on all {num_cards} cards, launching run") - run_event.set() - - for device_events in all_events: - device_events.wait_for_run_complete() - - outputs_per_card = [output_wrapper.get_output_tensors() for output_wrapper in all_output_wrappers] - per_card_start_end = {i: all_output_wrappers[i].get_start_end_time() for i in range(num_cards)} - - # Terminate the processes after reading the outputs - kill_event.set() - for proc_id, p in enumerate(procs): - p.join() - logger.info(f"Devices {device_ids_per_card[proc_id]} finished run successfully") - - # Clean up intermediate directories - if rm_tmp_dirs: - logger.info("Cleaning up temporary directories") - for tmp_dir in tmp_dirs: - shutil.rmtree(tmp_dir) - - if run_mode == RunMode.FORWARD: - run_result: RunResult = ForwardRun._create_run_result(outputs_per_card, per_card_start_end) - elif run_mode == RunMode.GENERATIVE: - run_result: RunResult = GenerativeRun._create_run_result(outputs_per_card, per_card_start_end) - - return run_result - def split_tensor_batch(input_data, num_cards: int): ''' Splits tensors in input data recursively @@ -585,33 +705,28 @@ def detach_all_tensors(data): raise TypeError("Input data should contain list or torch tensor only") return data - -def run_tti_data_parallel( + +def initialize_multicard_runner( arch: pybuda.BackendDevice, device_ids: List[List[int]], run_mode: RunMode, - inputs: Union[ForwardRunInputs, GenerativeRunInputs], - sync_at_run_start: bool = False, - rm_tmp_dirs: bool = True, + compile_inputs: Iterable[torch.Tensor], precompiled_tti_path: str = None, output_dir: str = "./device_images", - num_loops: Optional[int] = None, -) -> "RunResult": + benchmark_perf: bool = False +) -> MultiCardRunner: ''' - User-facing API. Run a tti on multiple cards in parallel. Arguments: - arch: Architecture of the devices. - device_ids: List of device ids to run the tti on, each sublist should start with mmio-mapped device id. - run_mode: Mode to run on. Currently supports forward and generative runs. - - inputs: List of inputs to run the tti on. - - sync_at_run_start: If True, the processes will wait until all processes are ready to run before starting the run. - - rm_tmp_dirs: If True, remove all temporary directories created for each card. + - compile_inputs: List of sample inputs to be used for compilation. - precompiled_tti_path: Path to a precompiled tti image to run on the cards. - output_dir: Directory to store the ttis as well as the unzipped tti directories. If it doesn't exist, one will be created. If precompiled_tti_path is provided, the tti will be copied to this directory. - - num_loops: Number of loops to run the tti. 
For generative runs, this will be hardcoded to 1. + - benchmark_perf: For internal perf analysis, to mimic the behaviour of benchmark.py for forward runs Returns: - - RunResult object containing the merged outputs and start/end times of the run on each card. + - MultiCardRunner object that the user can use to run on the targeted device_ids. ''' assert arch in [pybuda.BackendDevice.Wormhole_B0, pybuda.BackendDevice.Grayskull], "Unsupported device architecture" assert precompiled_tti_path @@ -621,46 +736,49 @@ def run_tti_data_parallel( if arch == pybuda.BackendDevice.Wormhole_B0 and os.environ.get("PYBUDA_FORCE_THREADS", "0") != "1": logger.warning("PYBUDA_FORCE_THREADS is not set, this may cause errors when running on multiple devices due to parallel execution of create-ethernet-map") + # initialize output directory output_dir = os.path.realpath(output_dir) - if not os.path.exists(output_dir): os.makedirs(output_dir) + # copy tti over to output directory if it doesn't exist image_path = _initialize_tti_image( output_dir=output_dir, precompiled_tti_path=precompiled_tti_path, ) + # Create per-card compile configs if run_mode == RunMode.FORWARD: - assert isinstance(inputs, ForwardRunInputs) - inputs_per_card = ForwardRunInputs.get_inputs_per_card(inputs, len(device_ids)) - configs: List[ForwardRunConfig] = [ - ForwardRunConfig( + compile_inputs_per_card = split_tensor_batch(compile_inputs, len(device_ids)) + compile_configs: List[CompileConfigForward] = [ + CompileConfigForward( chip_ids=devices, - inputs=inputs_per_card[card], + compile_inputs=compile_inputs_per_card[card_index], tti_path=image_path, - loop_count=num_loops, - ) for card, devices in enumerate(device_ids) + benchmark_perf=benchmark_perf, + ) for card_index, devices in enumerate(device_ids) ] - + elif run_mode == RunMode.GENERATIVE: - assert isinstance(inputs, GenerativeRunInputs) - inputs_per_card = GenerativeRunInputs.get_inputs_per_card(inputs, len(device_ids)) - image_path = _initialize_tti_image( - output_dir=output_dir, - precompiled_tti_path=precompiled_tti_path, - ) - configs: List[GenerativeRunConfig] = [ - GenerativeRunConfig( + compile_inputs_per_card = detach_all_tensors(split_tensor_batch(compile_inputs, len(device_ids))) + compile_configs: List[CompileConfigGenerative] = [ + CompileConfigGenerative( chip_ids=devices, - inputs=inputs_per_card[card], + compile_inputs=compile_inputs_per_card[card_index], tti_path=image_path, - ) for card, devices in enumerate(device_ids) + ) for card_index, devices in enumerate(device_ids) ] else: - raise TypeError("Invalid run mode provided. Supported modes are FORWARD and GENERATIVE.") + raise ValueError("Invalid run mode provided. 
Supported modes are FORWARD and GENERATIVE.") + + runner: MultiCardRunner = MultiCardRunner( + run_mode=run_mode, + device_ids=device_ids, + output_dir=output_dir + ) - run_result: RunResult = _run(run_mode=run_mode, configs=configs, output_dir=output_dir, sync_at_run_start=sync_at_run_start, rm_tmp_dirs=rm_tmp_dirs) + runner.initialize(compile_configs=compile_configs) - return run_result \ No newline at end of file + return runner + diff --git a/pybuda/pybuda/tti/archive.py b/pybuda/pybuda/tti/archive.py index 5048c597..97ba0705 100644 --- a/pybuda/pybuda/tti/archive.py +++ b/pybuda/pybuda/tti/archive.py @@ -647,6 +647,7 @@ def _create_device_override_backend_output_dir( device_id_overrides: List[int] ) -> str: new_backend_output_dir = TTIArchive._get_override_backend_output_path(original_backend_output_dir, device_id_overrides) + if os.path.exists(new_backend_output_dir): logger.info("TTDeviceImage: Using existing device override binaries directory {}", new_backend_output_dir) return new_backend_output_dir @@ -656,15 +657,17 @@ def _create_device_override_backend_output_dir( # Remove the original netlist and copy over the override netlist to the new binaries directory original_netlist_name = os.path.basename(original_netlist_path) - os.remove(os.path.join(new_backend_output_dir, original_netlist_name)) + new_backend_dir_original_netlist_path = os.path.join(new_backend_output_dir, original_netlist_name) + if os.path.exists(new_backend_dir_original_netlist_path): + os.remove(new_backend_dir_original_netlist_path) # Path to the netlist file override directly under unzipped_tti directory override_netlist_path = TTIArchive._get_override_netlist_path(original_netlist_path, device_id_overrides) override_netlist_name = os.path.basename(override_netlist_path) - override_netlist_path_in_backend_outdir = os.path.join(new_backend_output_dir, override_netlist_name) + new_backend_dir_override_netlist_path = os.path.join(new_backend_output_dir, override_netlist_name) - TTIArchive._copy_netlist_yaml(netlist_yaml=override_netlist_path, dst_dir=override_netlist_path_in_backend_outdir) + TTIArchive._copy_netlist_yaml(netlist_yaml=override_netlist_path, dst_dir=new_backend_dir_override_netlist_path) old_device_to_new_device_map = TTIArchive._get_original_device_to_new_device_map(original_netlist_path, device_id_overrides) @@ -676,7 +679,7 @@ def _create_device_override_backend_output_dir( if os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1": # Update device id suffix in trisc firmware directories - TTIArchive._update_n300_dp_trisc_firmware_directories(override_netlist_path_in_backend_outdir, old_device_to_new_device_map) + TTIArchive._update_n300_dp_trisc_firmware_directories(new_backend_dir_override_netlist_path, old_device_to_new_device_map) return new_backend_output_dir diff --git a/pybuda/test/benchmark/benchmark.py b/pybuda/test/benchmark/benchmark.py index 52c8e952..483b8e48 100755 --- a/pybuda/test/benchmark/benchmark.py +++ b/pybuda/test/benchmark/benchmark.py @@ -47,10 +47,11 @@ from pybuda.tools.tti_data_parallel import ( RunMode, RunResult, - ForwardRunInputs, - GenerativeRunInputs, + ForwardInputs, + GenerativeInputs, + MultiCardRunner, split_tensor_batch, - run_tti_data_parallel, + initialize_multicard_runner, ) def single_thread_generative_model_run(args, first_device, last_device, inputs, targets, output_q, num_tokens_to_generate, first_current_index, pad_token_id, write_index): @@ -254,12 +255,12 @@ def duplicate_tensor(data, factor: int): def data_parallel_tti_run( arch: 
BackendDevice, - inputs: Union[ForwardRunInputs, GenerativeRunInputs], + compile_inputs: Union[torch.Tensor, Tuple[torch.Tensor, ...]], + run_inputs: Union[ForwardInputs, GenerativeInputs], run_mode: RunMode, - loop_count: int, output_dir: str, # List of mmio mapped device ids - device_ids: List[int], + device_ids: List[List[int]], tt_device: Optional[pybuda.TTDevice] = None, precompiled_image_path: Optional[str] = None, ): @@ -267,9 +268,7 @@ def data_parallel_tti_run( assert tt_device or precompiled_image_path, "One of tt_device or precompiled_image_path must be specified" assert not (tt_device and precompiled_image_path), "Specify one of tt_device, precompiled_image_path" num_devices = len(device_ids) - - compile_inputs = inputs.inputs if run_mode == RunMode.FORWARD else inputs.compile_inputs - + if tt_device is not None: image_path = os.path.join(output_dir, "parallel_tti_run.tti") single_device_inputs = split_tensor_batch(compile_inputs, num_devices)[0] @@ -280,18 +279,23 @@ def data_parallel_tti_run( pybuda.pybuda_reset() - # Don't check outputs here, just care about perf - run_result: RunResult = run_tti_data_parallel( + runner: MultiCardRunner = initialize_multicard_runner( arch=arch, device_ids=device_ids, run_mode=run_mode, - inputs=inputs, - output_dir=output_dir, - num_loops=loop_count, + compile_inputs=compile_inputs, precompiled_tti_path=image_path, - sync_at_run_start=True, + output_dir=output_dir, + benchmark_perf=True, ) + # Don't check outputs here, just care about perf + run_result: RunResult = runner.run( + all_inputs=run_inputs + ) + + runner.shutdown() + return run_result.get_earliest_start(), run_result.get_latest_end() def print_start_info(): @@ -433,7 +437,8 @@ def run( if not num_tokens_to_generate: run_mode = RunMode.FORWARD - all_inputs = ForwardRunInputs(inputs=inputs) + all_inputs = ForwardInputs(run_inputs=[inputs] * args.loop_count) + compile_inputs = inputs else: run_mode = RunMode.GENERATIVE @@ -441,8 +446,7 @@ def run( inputs = duplicate_batch(inputs, len(device_list)) compile_inputs = duplicate_batch(compile_inputs, len(device_list)) - all_inputs = GenerativeRunInputs( - compile_inputs=compile_inputs, + all_inputs = GenerativeInputs( run_inputs=inputs, num_tokens_to_generate=num_tokens_to_generate, write_index=write_index, @@ -452,9 +456,9 @@ def run( start_time, end_time = data_parallel_tti_run( arch=arch, - inputs=all_inputs, + compile_inputs=compile_inputs, + run_inputs=all_inputs, run_mode=run_mode, - loop_count=args.loop_count, output_dir=args.parallel_tti, device_ids=device_ids, tt_device=tt_device, diff --git a/pybuda/test/tti/test_tti_data_parallel.py b/pybuda/test/tti/test_tti_data_parallel.py index 41a8b5e3..cf845373 100755 --- a/pybuda/test/tti/test_tti_data_parallel.py +++ b/pybuda/test/tti/test_tti_data_parallel.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, List +from typing import Optional, List, Tuple import shutil import pybuda import pybuda.backend @@ -12,12 +12,13 @@ from pybuda.pybudaglobal import pybuda_reset import numpy as np from pybuda.tools.tti_data_parallel import ( - split_tensor_batch, - run_tti_data_parallel, RunMode, - RunResult, - ForwardRunInputs, - GenerativeRunInputs + RunResult, + ForwardInputs, + GenerativeInputs, + MultiCardRunner, + split_tensor_batch, + initialize_multicard_runner, ) import sys sys.path.insert(1, "pybuda") @@ -105,14 +106,27 @@ def get_model_config(base_kwargs, model, config): return func(**kwargs) +def 
generate_random_inputs(sample_inputs, count): + input_shapes = [t.shape for t in sample_inputs] + all_inputs = [] + for _ in range(count): + inputs = [] + for shape in input_shapes: + inputs.append(torch.randn(*shape)) + + all_inputs.append(inputs) + + return all_inputs + def test_tti_mmio_dp_sanity(): clean_env = os.environ.copy() device_list = pybuda.detect_available_devices() assert device_list, "No devices available" - mmio_device_ids = [[0]] + mmio_device_ids = [[i] for i in range(len(device_list))] arch = device_list[0] - num_loops = 16 + num_loops = 2 + num_inputs_per_loop = 2 total_microbatch_size = 128 base_kwargs = { @@ -127,101 +141,55 @@ def test_tti_mmio_dp_sanity(): model_to_config = { "resnet": "resnet50", "bert": "base", - "mobilenet_v2": "224" } output_dir = "device_images_multi_mmio/" os.makedirs(output_dir, exist_ok=True) - for model, config in model_to_config.items(): model_config = get_model_config(base_kwargs, model, config) duts, inputs, targets, other = model_config + multi_inputs = [inputs] * num_inputs_per_loop module = duts['tt'] image_path = os.path.join(output_dir, f"{model}.tti") - compile_and_save_tti( - module=module, - arch=arch, - chip_ids=[0], - tti_output_path=image_path, - sample_inputs=inputs, - ) - run_result: RunResult = run_tti_data_parallel( - precompiled_tti_path=image_path, - run_mode=RunMode.FORWARD, - inputs=ForwardRunInputs(inputs=inputs), - arch=arch, - device_ids=mmio_device_ids, - num_loops=num_loops, - output_dir=output_dir, - sync_at_run_start=True - ) - outputs = run_result.outputs - cpu_outputs = [module.cpu_eval_forward(*inputs)] * num_loops - check_outputs(cpu_outputs, outputs) - - pybuda_reset() - os.environ = clean_env - - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - -# Sanity test that runs on a single card -def test_tti_n300_dp_sanity(): - clean_env = os.environ.copy() - device_list = pybuda.detect_available_devices() - assert device_list, "No devices available" - assert os.environ.get("PYBUDA_N300_DATA_PARALLEL", "0") == "1" - - device_ids = [[0, 1]] - arch = device_list[0] - num_loops = 16 - total_microbatch_size = 128 - base_kwargs = { - "training": False, - "microbatch": total_microbatch_size, - "data_type": 'Fp16_b', - "math_fidelity": 'HiFi3', - "arch": "wormhole_b0", - "devtype": "silicon", - } - - model_to_config = { - "resnet": "resnet50", - "bert": "base" - } - - output_dir="device_images_n300_dp/" - os.makedirs(output_dir, exist_ok=True) - - for model, config in model_to_config.items(): - model_config = get_model_config(base_kwargs, model, config) - duts, inputs, targets, other = model_config - module = duts['tt'] - image_path = os.path.join(output_dir, f"{model}.tti") + single_device_inputs = split_tensor_batch(inputs, len(mmio_device_ids))[0] compile_and_save_tti( module=module, arch=arch, num_chips=1, tti_output_path=image_path, - sample_inputs=inputs, + sample_inputs=single_device_inputs, ) - run_result: RunResult = run_tti_data_parallel( - precompiled_tti_path=image_path, + + # Generate device outputs + runner = initialize_multicard_runner( + arch=pybuda.BackendDevice.Wormhole_B0, + device_ids=mmio_device_ids, run_mode=RunMode.FORWARD, - inputs=ForwardRunInputs(inputs=inputs), - arch=arch, - device_ids=device_ids, - num_loops=num_loops, - output_dir=output_dir, - sync_at_run_start=True + compile_inputs=inputs, + precompiled_tti_path=image_path, + output_dir=output_dir ) - outputs = run_result.outputs - cpu_outputs = [module.cpu_eval_forward(*inputs)] * num_loops - check_outputs(cpu_outputs, outputs) 
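# [Editor's note] A minimal end-to-end sketch of the runner flow this test
# exercises, using the API introduced in this patch series. `image_path`,
# `sample_inputs`, `output_dir` and `num_loops` are illustrative placeholders:
#
#   runner = initialize_multicard_runner(
#       arch=pybuda.BackendDevice.Wormhole_B0,
#       device_ids=[[0], [1]],            # one sublist per card, mmio-mapped id first
#       run_mode=RunMode.FORWARD,
#       compile_inputs=sample_inputs,     # full batch; split per card internally via split_tensor_batch
#       precompiled_tti_path=image_path,
#       output_dir=output_dir,
#   )
#   result: RunResult = runner.run(ForwardInputs(run_inputs=[sample_inputs] * num_loops))
#   runner.shutdown()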
+ all_outputs = [] + for _ in range(num_loops): + run_result: RunResult = runner.run(ForwardInputs(run_inputs=multi_inputs)) + all_outputs.append(run_result.outputs) + + runner.shutdown() + + # Generate cpu outputs + all_cpu_outputs = [] + for single_inputs in multi_inputs: + all_cpu_outputs.append(module.cpu_eval_forward(*single_inputs)) + + all_cpu_outputs = [all_cpu_outputs] * num_loops + + # Compare outputs, check PCC + check_outputs(all_outputs, all_cpu_outputs) + pybuda_reset() os.environ = clean_env if os.path.exists(output_dir): - shutil.rmtree(output_dir) \ No newline at end of file + shutil.rmtree(output_dir) From 9e6b59c88aa5c019560c1de72a184abea063abd9 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 17 Jul 2024 12:58:22 +0000 Subject: [PATCH 035/116] Upgrade onnx and onnxruntime, make necessary docker/make changes (cherry picked from commit a6093953e71ef3d9da1d1d8c00fb47515238cbbd) --- python_env/core_requirements.txt | 4 ++-- python_env/module.mk | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python_env/core_requirements.txt b/python_env/core_requirements.txt index 0a59f6e3..bc0b31c4 100644 --- a/python_env/core_requirements.txt +++ b/python_env/core_requirements.txt @@ -26,8 +26,8 @@ multiprocess==0.70.13 mxnet==1.9.1 networkx==2.8.5 numpy==1.23.1 -onnx==1.15.0 -onnxruntime==1.16.3 +onnx==1.16.0 +onnxruntime==1.18.1 opencv-python-headless==4.6.0.66 # This is needed to avoid issue https://yyz-gitlab.local.tenstorrent.com/devops/devops/-/issues/95 pandas==1.5.3 diff --git a/python_env/module.mk b/python_env/module.mk index af99023a..ed0356ce 100644 --- a/python_env/module.mk +++ b/python_env/module.mk @@ -8,6 +8,7 @@ python_env: $(PYTHON_ENV)/.installed .PRECIOUS: $(PYTHON_ENV)/.installed $(PYTHON_ENV)/% $(PYTHON_ENV)/.installed: python_env/requirements.txt $(PYTHON_VERSION) -m venv $(PYTHON_ENV) + bash -c "unset LD_PRELOAD; source build/python_env/bin/activate && pip3 install --upgrade pip" bash -c "unset LD_PRELOAD; source $(PYTHON_ENV)/bin/activate && pip3 install wheel==0.37.1" bash -c "unset LD_PRELOAD; source $(PYTHON_ENV)/bin/activate && pip3 install -r python_env/requirements.txt -f https://download.pytorch.org/whl/cpu/torch_stable.html" touch $@ From e9d4436752bc43696597c2a40ef6d235aee696fe Mon Sep 17 00:00:00 2001 From: dsudhakar Date: Mon, 15 Jul 2024 15:02:38 +0000 Subject: [PATCH 036/116] Remove nlp tensorflow models (cherry picked from commit fd9c73854d590bd450747f0199eda7bd28ad19ac) --- .../tvm/nlp/tensorflow/tests_A/test_albert.py | 176 ------------- .../nlp/tensorflow/tests_B/test_gptj_tf.py | 39 --- .../nlp/tensorflow/tests_B/test_wav2vec2.py | 231 ------------------ .../tvm/nlp/tensorflow/tests_C/test_xlm.py | 95 ------- .../tvm/nlp/tensorflow/tests_C/test_xlnet.py | 61 ----- 5 files changed, 602 deletions(-) delete mode 100644 pybuda/test/tvm/nlp/tensorflow/tests_A/test_albert.py delete mode 100644 pybuda/test/tvm/nlp/tensorflow/tests_B/test_wav2vec2.py delete mode 100644 pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlm.py delete mode 100644 pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlnet.py diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_A/test_albert.py b/pybuda/test/tvm/nlp/tensorflow/tests_A/test_albert.py deleted file mode 100644 index fba95460..00000000 --- a/pybuda/test/tvm/nlp/tensorflow/tests_A/test_albert.py +++ /dev/null @@ -1,176 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Some basic bring-up tests of tracing functionality -# -import pytest - -import 
numpy as np -import tensorflow as tf -from transformers import AlbertConfig, TFAlbertModel -from transformers.models.albert.modeling_tf_albert import TFAlbertAttention, TFAlbertLayer - -from pybuda import ( - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, -) -from pybuda.config import CompileDepth -from test.tvm.utils import evaluate_framework_vs_pybuda -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -model_config_v1 = { - "_name_or_path": "albert-base-v1", - "architectures": [ - "AlbertForMaskedLM" - ], - "attention_probs_dropout_prob": 0.1, - "bos_token_id": 2, - "classifier_dropout_prob": 0.1, - "down_scale_factor": 1, - "embedding_size": 128, - "eos_token_id": 3, - "gap_size": 0, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "inner_group_num": 1, - "intermediate_size": 3072, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 512, - "model_type": "albert", - "net_structure_type": 0, - "num_attention_heads": 12, - "num_hidden_groups": 1, - "num_hidden_layers": 12, - "num_memory_blocks": 0, - "pad_token_id": 0, - "position_embedding_type": "absolute", - "torchscript": True, - "transformers_version": "4.12.2", - "type_vocab_size": 2, - "vocab_size": 30000 -} - - -def test_albert_v1(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.POST_INITIAL_GRAPH_PASS - - class TF_AlbertAttention(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFAlbertAttention(config) - - def call(self, hidden_states): - return self.layer(hidden_states, None, None, None) - - config = AlbertConfig(**model_config_v1) - - model = TF_AlbertAttention(config) - mod = TFModule( - "albert_attention_tf", - model, - ) - input_shape = (1, 768, 768) - - relative_atol = 0.1 - if test_kind == TestKind.INFERENCE and test_device.devtype == BackendType.Silicon: - relative_atol = 0.3 - - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - relative_atol=relative_atol, - waive_gradient_errors={"tf__albert_attention/tf_albert_attention/key/bias:0", - "tf__albert_attention_1/tf_albert_attention_1/key/bias:0"}, - ) - ) - - -model_config_v2 = { - "_name_or_path": "albert-base-v2", - "architectures": [ - "AlbertForMaskedLM" - ], - "attention_probs_dropout_prob": 0.1, - "bos_token_id": 2, - "classifier_dropout_prob": 0.1, - "down_scale_factor": 1, - "embedding_size": 128, - "eos_token_id": 3, - "gap_size": 0, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "inner_group_num": 1, - "intermediate_size": 3072, - "layer_norm_eps": 1e-12, - "max_position_embeddings": 512, - "model_type": "albert", - "net_structure_type": 0, - "num_attention_heads": 12, - "num_hidden_groups": 1, - "num_hidden_layers": 12, - "num_memory_blocks": 0, - "pad_token_id": 0, - "position_embedding_type": "absolute", - "torchscript": True, - "transformers_version": "4.12.2", - "type_vocab_size": 2, - "vocab_size": 30000 -} - -def test_albert_v2(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - 
compiler_cfg = _get_global_compiler_config() - if test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.POST_INITIAL_GRAPH_PASS - - class TF_AlbertAttention(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFAlbertAttention(config) - - def call(self, hidden_states): - return self.layer(hidden_states, None, None, None) - - config = AlbertConfig(**model_config_v2) - - model = TF_AlbertAttention(config) - mod = TFModule( - "albert_attention_tf", - model, - ) - input_shape = (1, 768, 768) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - waive_gradient_errors={"tf__albert_attention/tf_albert_attention/key/bias:0", - "tf__albert_attention_3/tf_albert_attention_3/key/bias:0"}, - ) - ) diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py index 680f89df..07e511a8 100644 --- a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py +++ b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py @@ -44,12 +44,6 @@ def fixed_pos_embedding(x: tf.Tensor, seq_dim: int = 1, seq_len: Optional[int] = return tf.cast(tf.sin(sinusoid_inp), dtype=x.dtype), tf.cast(tf.cos(sinusoid_inp), dtype=x.dtype) -def rotate_every_two(x: tf.Tensor) -> tf.Tensor: - rotate_half_tensor = tf.stack((-x[:, :, :, 1::2], x[:, :, :, ::2]), axis=-1) - new_shape = shape_list(rotate_half_tensor)[:-2] + [tf.math.reduce_prod(shape_list(rotate_half_tensor)[-2:])] - rotate_half_tensor = tf.reshape(rotate_half_tensor, new_shape) - return rotate_half_tensor - def apply_rotary_pos_emb(x: tf.Tensor, sincos: tf.Tensor, offset: int = 0) -> tf.Tensor: sin_pos, cos_pos = sincos @@ -59,40 +53,7 @@ def apply_rotary_pos_emb(x: tf.Tensor, sincos: tf.Tensor, offset: int = 0) -> tf -def test_tvm_rotate_every_two(test_kind, test_device): - if test_kind.is_training(): - pytest.skip() - - class GPTJRotateEveryTwo(tf.keras.Model): - def __init__(self, config): - super().__init__() - - def call(self, key): - seq_len = key.shape[1] - k_rot = key[:, :, :, :64] - k_pass = key[:, :, :, 64:] - sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len) - k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=0) - key = tf.concat([k_rot, k_pass], axis=-1) - return key - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - config = GPTJConfig() - model = GPTJRotateEveryTwo(config) - mod = TFModule("fixed_pos_embedding_tf", model) - - input_shape = (1, 128, 16, 256) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) @pytest.mark.skip(reason="Tested with fallback") def test_gptj_block(test_kind, test_device): diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_wav2vec2.py b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_wav2vec2.py deleted file mode 100644 index 0cf01547..00000000 --- a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_wav2vec2.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# -# Wav2Vec2 basic bring-up tests of tracing functionality -# -import pytest - -import tensorflow as tf -from transformers import Wav2Vec2Config -from transformers.models.wav2vec2.modeling_tf_wav2vec2 import ( - TFWav2Vec2Model, - TFWav2Vec2FeatureEncoder, - TFWav2Vec2FeatureProjection, - TFWav2Vec2WeightNormConv1D, - 
TFWav2Vec2Encoder, -) - -from pybuda import ( - TFModule, - VerifyConfig, -) -from pybuda.config import CompileDepth -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_wav2vec2_full_model(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - else: - # Unsupported backward pass for concatenate op - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - config = Wav2Vec2Config() - framework_module = TFWav2Vec2Model(config) - - module = TFModule( - "wav2vec2_feature_encoder", - framework_module, - ) - - input_shape = (1, 512) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_wav2vec2_feature_encoder(test_kind, test_device): - pytest.skip() # Tested in full model, useful for debugging. - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - class TFWav2Vec2_FeatureEncoder(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFWav2Vec2FeatureEncoder(config) - - def call(self, input_values): - return self.layer( - input_values, - ) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - # Unsupported HW ops - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - # TODO: Tensor mismatch on bw_in0_gelu_197_multiply_1 from layernorm_196 - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - config = Wav2Vec2Config() - framework_module = TFWav2Vec2_FeatureEncoder(config) - module = TFModule( - "wav2vec2_feature_encoder", - framework_module, - ) - - input_shape = (1, 512) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_wav2vec2_feature_projection(test_kind, test_device): - pytest.skip() # Tested in full model, useful for debugging. - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - class TFWav2Vec2_FeatureProjection(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFWav2Vec2FeatureProjection(config, name="feature_projection") - - def call(self, input_values): - return self.layer( - input_values, - ) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - # Segmentation fault on balancer - access params through backend api - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - else: - # Segmentation fault on balancer - access params through backend api - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - config = Wav2Vec2Config() - framework_module = TFWav2Vec2_FeatureProjection(config) - module = TFModule( - "wav2vec2_feature_projection", - framework_module, - ) - - input_shape = (1, 512) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_wav2vec2_conv1d_with_norm(test_kind, test_device): - pytest.skip() # Tested in full model, useful for debugging. 
- if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - class TFWav2Vec2_WeightNormConv1D(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFWav2Vec2WeightNormConv1D( - filters=config.hidden_size, - kernel_size=config.num_conv_pos_embeddings, - groups=config.num_conv_pos_embedding_groups, - explicit_padding=config.num_conv_pos_embeddings // 2, - name="conv", - ) - - def call(self, input_values): - return self.layer( - input_values, - ) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - compiler_cfg.compile_depth = CompileDepth.FULL - pytest.skip() - - config = Wav2Vec2Config() - config.num_hidden_layers = 1 - framework_module = TFWav2Vec2_WeightNormConv1D(config) - module = TFModule( - "wav2vec2_conv1d_with_norm", - framework_module, - ) - - input_shape = (1, 1, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) - - -def test_wav2vec2_encoder(test_kind, test_device): - pytest.skip() # Tested in full model, useful for debugging. - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - class TFWav2Vec2_Encoder(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFWav2Vec2Encoder(config, name="encoder") - - def call(self, input_values): - return self.layer( - input_values, - ) - - compiler_cfg = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_cfg.compile_depth = CompileDepth.FULL - else: - compiler_cfg.compile_depth = CompileDepth.FULL - - config = Wav2Vec2Config() - config.num_hidden_layers = 1 - framework_module = TFWav2Vec2_Encoder(config) - module = TFModule( - "wav2vec2_encoder", - framework_module, - ) - - input_shape = (1, 1, 768) - verify_module( - module, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - ) diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlm.py b/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlm.py deleted file mode 100644 index 55e9b816..00000000 --- a/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlm.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest - -import tensorflow as tf -from transformers.models.xlm import XLMConfig -from transformers.models.xlm.modeling_tf_xlm import TFXLMMultiHeadAttention, TFXLMTransformerFFN - -import math -import itertools -from pybuda import ( - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, - tvm_to_python, -) -from test.tvm.utils import evaluate_framework_vs_pybuda - -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_xlm_attention_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - class TFXLM_MHA(tf.keras.Model): - def __init__(self): - super().__init__() - self.config = XLMConfig() - self.layer = TFXLMMultiHeadAttention(self.config.n_heads, self.config.emb_dim, self.config) - - def call(self, 
hidden_states, mask): - return self.layer(hidden_states, mask, None, None, None, False, False) - - model = TFXLM_MHA() - - mod = TFModule("XLM_attention_tf", model) - - input_shape = (1, 16, 2048) - mask_shape = (1, 16) - verify_module( - mod, - (input_shape, mask_shape), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - uniform_inputs=True, - ) - -def test_tvm_xlm_FFN_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - - class TFXLM_FFN(tf.keras.Model): - def __init__(self): - super().__init__() - self.config = XLMConfig() - self.layer = TFXLMTransformerFFN( - self.config.emb_dim, - self.config.emb_dim * 4, - self.config.emb_dim, - self.config) - - def call(self, hidden_states): - return self.layer(hidden_states,) - - model = TFXLM_FFN() - - mod = TFModule("XLM_ffn_tf", model) - - input_shape = (1, 16, 2048) - verify_module( - mod, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlnet.py b/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlnet.py deleted file mode 100644 index c7170bbc..00000000 --- a/pybuda/test/tvm/nlp/tensorflow/tests_C/test_xlnet.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from pybuda.config import CompileDepth -import pytest -import tensorflow as tf -from transformers import XLNetConfig -from transformers.models.xlnet.modeling_tf_xlnet import TFXLNetLayer,TFXLNetMainLayer - -from pybuda import ( - TFModule, - TTDevice, - BackendType, - CompilerConfig, - VerifyConfig, - optimizers, - pybuda_compile, - tvm_to_python, -) -from pybuda.config import CompileDepth, _get_global_compiler_config -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind - - -def test_tvm_xlm_attention_tf(test_kind, test_device): - if test_kind == TestKind.TRAINING: # only run recompute test in post-commit - pytest.skip() - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH - - class TFXLNet_Layer(tf.keras.Model): - def __init__(self, config): - super().__init__() - self.layer = TFXLNetLayer(config) - - def call(self, hidden_states,pos_emb): - # Pybuda -> TVM compile removes batch dim. 
- hidden_states = tf.transpose(hidden_states, perm=[1, 0, 2]) - pos_emb = tf.transpose(pos_emb, perm=[1, 0, 2]) - - return self.layer(hidden_states, None, None, None, pos_emb, None, None, None, None, False, False) - - config = XLNetConfig() - submodel = TFXLNet_Layer(config) - - mod = TFModule("XLM_attention_tf", submodel) - - input_shape = (1, 16, 1024) - pos_emb = (1, 32, 1024) - - verify_module( - mod, - (input_shape, pos_emb, ), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ), - uniform_inputs=True, - ) \ No newline at end of file From 90f521aba0efd972ebbf0f9c9ae062a8a9b0481c Mon Sep 17 00:00:00 2001 From: Vladica Obojevic Date: Fri, 19 Jul 2024 07:22:37 +0000 Subject: [PATCH 037/116] Test all element-wise binary operators according to test plan (cherry picked from commit 62d1285242fc5cd05ef6df34e230ccf72d30f696) --- .../models/test_plan/__init__.py | 11 + .../test_plan/model_op_src_const_eval_pass.py | 45 ++ .../test_plan/model_op_src_from_another_op.py | 32 ++ .../test_plan/model_op_src_from_dram_queue.py | 23 + .../model_op_src_from_dram_queue_prologued.py | 35 ++ .../test_plan/model_op_src_from_host.py | 23 + .../test_plan/model_op_src_from_tm_edge1.py | 29 ++ .../test_plan/model_op_src_from_tm_edge2.py | 30 ++ .../eltwise_binary/test_eltwise_binary.py | 463 +++++++++++++++++- 9 files changed, 688 insertions(+), 3 deletions(-) create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py create mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py b/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py new file mode 100644 index 00000000..580a3bd3 --- /dev/null +++ b/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from .model_op_src_from_another_op import BudaElementWiseBinaryTest +from .model_op_src_from_tm_edge1 import BudaElementWiseBinaryTest +from .model_op_src_from_tm_edge2 import BudaElementWiseBinaryTest +from .model_op_src_from_host import BudaElementWiseBinaryTest +from .model_op_src_from_dram_queue import BudaElementWiseBinaryTest +from .model_op_src_from_dram_queue_prologued import BudaElementWiseBinaryTest +from .model_op_src_const_eval_pass import BudaElementWiseBinaryTest diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py new file mode 100644 index 00000000..5007e2dc --- /dev/null +++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: 
Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from constant inputs
+
+import pybuda
+import torch
+
+from pybuda import PyBudaModule
+from test.operators.utils import ShapeUtils
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src const eval pass")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src const eval pass"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+        def my_rand(*shape, requires_grad=False):
+            return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach()
+
+        self.constant_shape = ShapeUtils.reduce_microbatch_size(shape)
+
+        self.add_constant("c1")
+        self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True))
+
+        self.add_constant("c2")
+        self.set_constant("c2", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True))
+
+        self.inputs = [
+            pybuda.Tensor.create_from_torch(my_rand(*self.shape))
+        ]
+
+    def forward(self, x, y):
+        v1 = self.operator(self.opname + "0", self.get_constant("c1"), self.get_constant("c2"))
+        # v2 and v3 consume inputs
+        v2 = pybuda.op.Add("Add1", x, y)
+        v3 = pybuda.op.Add("Add2", v1, v2)
+        return v3
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py
new file mode 100644
index 00000000..6d33d453
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from another operator
+
+import pybuda
+
+from pybuda import PyBudaModule
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from another op")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from another op"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+    def forward(self, x, y):
+        # we use Add and Subtract operators to create two operands which are inputs for the binary operator
+        xx = pybuda.op.Add("Add0", x, y)
+        yy = pybuda.op.Subtract("Subtract0", x, y)
+        output = self.operator(self.opname + "1", xx, yy)
+        return output
+
+    # TODO: check whether this is needed
+    # def values(self):
+    #     return [item.value() for item in self.inputs]
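# [Editor's note] A minimal sketch of how one of the scenario models above is
# instantiated; the values are illustrative, and the parametrized harness in
# test_eltwise_binary.py below builds the equivalent call via eval():
#
#   import pybuda
#   from test.operators.eltwise_binary.models import test_plan
#
#   model = test_plan.model_op_src_from_another_op.BudaElementWiseBinaryTest(
#       operator=pybuda.op.Multiply, opname="Multiply", shape=(1, 3, 3))
#   # forward(x, y) computes Multiply1(Add0(x, y), Subtract0(x, y)), so both
#   # operands of the operator under test come from other ops.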
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py
new file mode 100644
index 00000000..c497ffd4
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from dram queue
+
+from pybuda import PyBudaModule
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+    def forward(self, x, y):
+        output = self.operator(self.opname + "0", x, y)
+        return output
\ No newline at end of file
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py
new file mode 100644
index 00000000..ad30217c
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from dram queue, prologued
+
+import pybuda
+import torch
+
+from pybuda import PyBudaModule
+from test.operators.utils import ShapeUtils
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue prologued")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue prologued"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+        def my_rand(*shape, requires_grad=False):
+            return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach()
+
+        self.shape_input = ShapeUtils.reduce_microbatch_size(shape)
+
+        self.add_constant("c")
+        self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True))
+
+    def forward(self, x):
+        output = self.operator(self.opname + "0", self.get_constant("c"), x)
+        return output
\ No newline at end of file
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py
new file mode 100644
index 00000000..1de7810a
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from host
+
+from pybuda import PyBudaModule
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from host")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from host"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+    def forward(self, x, y):
+        output = self.operator(self.opname + "0", x, y)
+        return output
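# [Editor's note] The from-host and from-dram-queue models above are
# intentionally identical modules; the scenario difference is driven purely by
# the input-source flag the harness below hands to the compiler, roughly:
#
#   CompilerUtils.set_input_source(InputSourceFlags.FROM_DRAM.value)
#
# whereas the from-host case keeps the default input queue placement.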
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py
new file mode 100644
index 00000000..8f8ec076
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from tm edge
+# Combination: operator -> tm -> input
+
+
+import pybuda
+
+from pybuda import PyBudaModule
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge1")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge1"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+    def forward(self, x, y):
+        xx = pybuda.op.Add("Add0", x, y)
+        yy = pybuda.op.tm.Transpose("Transpose0", xx, -1, -2)
+        output = self.operator(self.opname + "1", yy, yy)
+        return output
diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py
new file mode 100644
index 00000000..5f9948df
--- /dev/null
+++ b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Model for testing element-wise binary operators
+# when operand source is from tm edge
+# Combination: tm -> input
+
+
+import pybuda
+
+from pybuda import PyBudaModule
+
+
+class BudaElementWiseBinaryTest(PyBudaModule):
+
+    def __init__(self, operator, opname, shape):
+        super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge2")
+        self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge2"
+        self.operator = operator
+        self.opname = opname
+        self.shape = shape
+
+    def forward(self, x, y):
+        xx = pybuda.op.tm.Transpose("Transpose0", x, -1, -2)
+        yy = pybuda.op.tm.Transpose("Transpose1", y, -1, -2)
+        output = self.operator(self.opname + "2", xx, yy)
+        return output
diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py
index 8363a17c..a95f4d78 100644
--- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py
+++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py
@@ -5,17 +5,474 @@
 # Tests for testing of element-wise binary operators
 #
 # In this test we use pytorch tensors and operators to verify buda operators
-#
+
+
+
+# GENERAL OP SUPPORT TEST PLAN:
+# 1. Operand type - any supported type
+# 2. Operand source(s):
+# (+)  2.1 From another op
+#       - Operator -> input
+# (+)  2.2 From tm edge
+#       - Combination: operator -> tm -> input
+#       - tm -> input
+# (+)  2.3 From DRAM queue
+#       - input_queue flag = false
+#       - Special case of From host? May it be triggered if the operator is not the first node of the network?
+#       - Can this be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# (+)  2.4 From DRAM, but prologued (constant)
+#       - Constants must be small enough to fit into L1
+#       - Verification via netlists that scenario is triggered
+#       - Inputs are not prologued for microbatch size = 1
+# (+)  2.5 Const Inputs (const eval pass)
+#       - Operator where all inputs are constants. Does it make a difference if tensor is big > L1
+#       - Verification via netlists that scenario is triggered???
+# (+)  2.6 From host
+#       - Input tensor as input of network -> Operator is first node in network and input_queue flag = true
+#       - Can this scenario be triggered from pybuda.Parameter?
+#       - Can this be triggered from big pybuda.Constant?
+# 3. Operand shapes type(s):
+# (+)  3.1 Full tensor (i.e. full expected shape)
+#       - Is 3 dims max for all ops? Ex. Conv is 3d max
+# (+)  3.2 Tensor reduce on one or more dims to 1
+#       - Vector
+#       - Only one dim is not equal to 1
+# (/)  3.3 Scalar
+#       - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value
+# 4. 
Operand / output size of dimensions (few examples of each, 10 values total) +# (+) 4.1 Divisible by 32 +# (+) 4.2 Prime numbers +# (+) 4.3 Very large (thousands, 10s of thousands) +# - 100x100, 100x1000 +# - maybe nightly only +# (+) 4.4 Extreme ratios between height/width +# 4.5 ...probably many more interesting combinations here +# 5. Data format - all supported formats +# (/) 5.1 Output DF +# (/) 5.2 Intermediate DF +# (/) 5.3 Accumulation DF +# (+) 5.4 Operand DFs +# (+) 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 +# (-) 7. Special attributes - if applicable.. like approx_mode for Exp, for example + import os import pytest import numpy as np +from typing import List, Dict +from loguru import logger + import pybuda import pybuda.op -from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig + +from pybuda.op_repo import TensorShape +from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.conftest import TestDevice + +from pybuda import TTDevice, pybuda_compile, VerifyConfig, CompilerConfig from . import models +from .models import test_plan + +TEST_PLAN_MODELS_PATH = "./pybuda/test/operators/eltwise_binary/models/test_plan/" + + +def verify( + test_device: TestDevice, + input_model: str, + input_operator: str, + input_shape: TensorShape, + number_of_operands: int, + input_params: List[Dict] = [], + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, +): + '''Common verification function for all tests''' + + architecture = f'test_plan.{input_model}.BudaElementWiseBinaryTest(operator=pybuda.op.{input_operator}, opname="{input_operator}", shape={input_shape})' + model = eval(architecture) + + input_shapes = tuple([input_shape for _ in range(number_of_operands)]) + logger.trace(f"***input_shapes: {input_shapes}") + + if input_source_flag: + CompilerUtils.set_input_source(input_source_flag.value) + + if math_fidelity: + CompilerUtils.set_math_fidelity(math_fidelity) + + if dev_data_format: + input_params.append({"dev_data_format": dev_data_format}) + + VerifyUtils.verify(model, test_device, input_shapes, input_params) + + +def get_eltwise_binary_ops(): + return [ + "Add", #00 + "Max", #01 + "Min", #02 + "Power", #03 + "Subtract", #04 + "Multiply", #05 + "Heaviside", #06 + "Greater", #07 + "GreaterEqual", #08 + "Less", #09 + "LessEqual", #10 + "Equal", #11 + "NotEqual", #12 + ] + +def get_input_shapes(): + return [ + # 2-dimensional shape, microbatch_size = 1: + (1, 4), #00 # 3.1 Full tensor (i.e. full expected shape) + (1, 17), #01 # 3.1 Full tensor (i.e. full expected shape) + (1, 23), #02 # 3.2 Tensor reduce on one or more dims to 1 + (1, 1), #03 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100), #04 # 4.3 Very large (thousands, 10s of thousands) + (1, 500), #05 # 4.3 Very large (thousands, 10s of thousands) + (1, 1000), #06 # 4.4 Extreme ratios between height/width + (1, 1920), #07 # 4.4 Extreme ratios between height/width + (1, 10000), #08 # 4.4 Extreme ratios between height/width + (1, 64), #09 # 4.1 Divisible by 32 + (1, 96), #10 # 4.1 Divisible by 32 + (1, 41), #11 # 4.2 Prime numbers + (1, 3), #12 # 4.2 Prime numbers + + # 2-dimensional shape, microbatch_size > 1: + # All shapes fails for all operators + pytest.param((3, 4), #13 # 3.1 Full tensor (i.e. full expected shape) + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((45, 17), #14 # 3.1 Full tensor (i.e. 
full expected shape) + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((64, 1), #15 # 3.2 Tensor reduce on one or more dims to 1 + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((100, 100), #16 # 4.3 Very large (thousands, 10s of thousands) + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((1000, 100), #17 # 4.3 Very large (thousands, 10s of thousands) + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((10, 1000), #18 # 4.4 Extreme ratios between height/width + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((9920, 1), #19 # 4.4 Extreme ratios between height/width + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((10000, 1), #20 # 4.4 Extreme ratios between height/width + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((32, 64), #21 # 4.1 Divisible by 32 + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((160, 96), #22 # 4.1 Divisible by 32 + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((17, 41), #23 # 4.2 Prime numbers + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + pytest.param((89, 3), #24 # 4.2 Prime numbers + marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + + # 3-dimensional shape, microbatch_size = 1: + (1, 3, 4), #25 # 3.1 Full tensor (i.e. full expected shape) + (1, 45, 17), #26 # 3.1 Full tensor (i.e. full expected shape) + (1, 1, 23), #27 # 3.2 Tensor reduce on one or more dims to 1 + (1, 64, 1), #28 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100, 100), #29 # 4.3 Very large (thousands, 10s of thousands) + (1, 1000, 100), #30 # 4.3 Very large (thousands, 10s of thousands) + (1, 10, 1000), #31 # 4.4 Extreme ratios between height/width + (1, 9920, 1), #32 # 4.4 Extreme ratios between height/width + (1, 10000, 1), #33 # 4.4 Extreme ratios between height/width + (1, 32, 64), #34 # 4.1 Divisible by 32 + (1, 160, 96), #35 # 4.1 Divisible by 32 + (1, 17, 41), #36 # 4.2 Prime numbers + (1, 89, 3), #37 # 4.2 Prime numbers + + # 3-dimensional shape, microbatch_size > 1: + (2, 3, 4), #38 # 3.1 Full tensor (i.e. full expected shape) + (11, 45, 17), #39 # 3.1 Full tensor (i.e. full expected shape) + (11, 1, 23), #40 # 3.2 Tensor reduce on one or more dims to 1 + (11, 64, 1), #41 # 3.2 Tensor reduce on one or more dims to 1 + (100, 100, 100), #42 # 4.3 Very large (thousands, 10s of thousands) + (10, 1000, 100), #43 # 4.3 Very large (thousands, 10s of thousands) + (10, 10000, 1), #44 # 4.4 Extreme ratios between height/width + (32, 32, 64), #45 # 4.1 Divisible by 32 + (64, 160, 96), #46 # 4.1 Divisible by 32 + (11, 17, 41), #47 # 4.2 Prime numbers + (13, 89, 3), #48 # 4.2 Prime numbers + + # 4-dimensional shape, microbatch_size = 1: + (1, 2, 3, 4), #49 # 3.1 Full tensor (i.e. full expected shape) + (1, 11, 45, 17), #50 # 3.1 Full tensor (i.e. 
full expected shape) + (1, 11, 1, 23), #51 # 3.2 Tensor reduce on one or more dims to 1 + (1, 11, 64, 1), #52 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100, 100, 100), #53 # 4.3 Very large (thousands, 10s of thousands) + (1, 10, 1000, 100), #54 # 4.3 Very large (thousands, 10s of thousands) + (1, 1, 10, 1000), #55 # 4.4 Extreme ratios between height/width + (1, 1, 9920, 1), #56 # 4.4 Extreme ratios between height/width + (1, 10, 10000, 1), #57 # 4.4 Extreme ratios between height/width + (1, 32, 32, 64), #58 # 4.1 Divisible by 32 + (1, 64, 160, 96), #59 # 4.1 Divisible by 32 + (1, 11, 17, 41), #60 # 4.2 Prime numbers + (1, 13, 89, 3), #61 # 4.2 Prime numbers + + # 4-dimensional shape, microbatch_size > 1: + (3, 11, 45, 17), #62 # 3.1 Full tensor (i.e. full expected shape) + (2, 2, 3, 4), #63 # 3.1 Full tensor (i.e. full expected shape) + (4, 11, 1, 23), #64 # 3.2 Tensor reduce on one or more dims to 1 + (5, 11, 64, 1), #65 # 3.2 Tensor reduce on one or more dims to 1 + (6, 100, 100, 100), #66 # 4.3 Very large (thousands, 10s of thousands) + (7, 10, 1000, 100), #67 # 4.3 Very large (thousands, 10s of thousands) + (8, 1, 10, 1000), #68 # 4.4 Extreme ratios between height/width + (9, 1, 9920, 1), #69 # 4.4 Extreme ratios between height/width + (10, 10, 10000, 1), #70 # 4.4 Extreme ratios between height/width + (11, 32, 32, 64), #71 # 4.1 Divisible by 32 + pytest.param((12, 64, 160, 96), #72 # 4.1 Divisible by 32 + marks=pytest.mark.skip(reason="RuntimeError: Fatal Python error: Segmentation fault")), + (13, 11, 17, 41), #73 # 4.2 Prime numbers + (14, 13, 89, 3), #74 # 4.2 Prime numbers + ] + + +@pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) +@pytest.mark.parametrize("input_model", + [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if item.startswith("model") and not item.__contains__("prologued")] +) +@pytest.mark.parametrize("input_shape", get_input_shapes()) +def test_eltwise_binary_ops_per_test_plan( + input_operator, + input_model, + input_shape, + test_device, + dev_data_format=None, + input_math_fidelity=None +): + s = get_input_shapes() + + # Observed Bugs: -------------------------------------------------------------------------------------------------------------------- + # 1. input_shape in ((1, 1000, 100), (10, 1000, 100)): + if input_model == "model_op_src_from_tm_edge1" and input_operator == "Heaviside" and input_shape in (s[30], s[43]): + pytest.xfail(reason="RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/policies/policy_utils.cpp:2221: " + + "graph ->get_edges( graph->get_node_by_name(nopInsertInst->src), " + + "graph->get_node_by_name(nopInsertInst->dest)) .size() == 1") + # 2. 
input_shape in ((1, 9920, 1), (1, 1, 9920, 1), (9, 1, 9920, 1)): + if input_model == "model_op_src_from_another_op" and input_operator in ["Equal", "NotEqual"] and input_shape in (s[32], s[56], s[69]): + pytest.xfail(reason="RuntimeError: Fatal balancer error: Could not reconcile constraints: path[Add0 -> _fused_op_0]") + # ------------------------------------------------------------------------------------------------------------------------------------ + + + input_source_flag = None + if input_model == "model_op_src_from_dram_queue": + input_source_flag = InputSourceFlags.FROM_DRAM + + verify( + test_device=test_device, + input_model=input_model, + input_operator=input_operator, + input_shape=input_shape, + number_of_operands=2, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=input_math_fidelity, + ) + + # netlist validations: + + file_path = VerifyUtils.get_netlist_filename() + + if input_model == "model_op_src_from_dram_queue": + assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' + assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + + if input_model == "model_op_src_const_eval_pass": + # Here we check there is no key with operator name in the netlist in graphs section + d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + for key in d.keys(): + assert input_operator not in key + + +def get_eltwise_binary_ops_prologued(): + return [ + pytest.param("Add"), #00 + pytest.param("Max"), #01 + pytest.param("Min"), #02 + pytest.param("Power", #03 + marks=pytest.mark.xfail(reason="AssertionError: Data mismatch detected")), + pytest.param("Subtract"), #04 + pytest.param("Multiply"), #05 + pytest.param("Heaviside"), #06 + pytest.param("Greater"), #07 + pytest.param("GreaterEqual"), #08 + pytest.param("Less"), #09 + pytest.param("LessEqual"), #10 + pytest.param("Equal"), #11 + pytest.param("NotEqual"), #12 + ] + +def get_input_shapes_prologued(): + # Columns: input_shape, input_source_flag, should_prolog" + return [ + # 2-dimensional shape, microbatch_size = 1: + ((1, 16), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #00 # 3.1 Full tensor (i.e. full expected shape) + ((1, 17), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #01 # 3.1 Full tensor (i.e. full expected shape) + + # 2-dimensional shape, microbatch_size > 1: + pytest.param((4, 16), InputSourceFlags.FROM_DRAM_PROLOGUED, True, #02 # 3.1 Full tensor (i.e. full expected shape) + marks=pytest.mark.xfail(reason="Doesn't work for microbatchsize > 1 and two dimensions.")), + pytest.param((3, 17), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False, #03 # 3.1 Full tensor (i.e. full expected shape) + marks=pytest.mark.xfail(reason="Doesn't work for microbatchsize > 1 and two dimensions.")), + + # 3-dimensional shape: + ((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #04 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #05 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #06 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #07 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #08 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #09 !!! # 3.1 Full tensor (i.e. 
full expected shape) - not according to the documentation!
+        ((2, 10, 5), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),      #10  # 3.1 Full tensor (i.e. full expected shape)
+        ((2, 1, 15), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),      #11  # 3.2 Tensor reduce on one or more dims to 1
+        ((2, 50, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),      #12  # 3.2 Tensor reduce on one or more dims to 1
+        ((2, 100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),   #13  # 4.3 Very large (thousands, 10s of thousands)
+        ((2, 100, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False), #14  # 4.3 Very large (thousands, 10s of thousands)
+        ((2, 1, 10000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False),  #15  # 4.4 Extreme ratios between height/width
+        ((2, 10000, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False),  #16  # 4.4 Extreme ratios between height/width
+        ((2, 32, 32), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),     #17  # 4.1 Divisible by 32
+        ((2, 96, 96), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),     #18  # 4.1 Divisible by 32
+        ((2, 13, 97), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True),     #19  # 4.2 Prime numbers
+
+        # 4-dimensional shape, microbatch_size = 1:
+        ((1, 2, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True),        #20  # 3.1 Full tensor (i.e. full expected shape)
+        ((1, 17, 13, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #21  # 3.1 Full tensor (i.e. full expected shape)
+
+        # 4-dimensional shape, microbatch_size > 1:
+        ((2, 2, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True),        #22  # 3.1 Full tensor (i.e. full expected shape)
+        ((2, 17, 13, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #23  # 3.1 Full tensor (i.e. full expected shape)
+    ]
+
+
+@pytest.mark.parametrize("input_operator", get_eltwise_binary_ops_prologued())
+@pytest.mark.parametrize("input_model", ["model_op_src_from_dram_queue_prologued"])
+@pytest.mark.parametrize("input_shape, input_source_flag, should_prolog", get_input_shapes_prologued())
+def test_eltwise_binary_ops_per_test_plan_dram_prologued(
+    input_operator,
+    input_model,
+    input_shape,
+    input_source_flag,
+    should_prolog,
+    test_device,
+    dev_data_format=None,
+    input_math_fidelity=None
+):
+
+    verify(
+        test_device=test_device,
+        input_model=input_model,
+        input_operator=input_operator,
+        input_shape=input_shape,
+        number_of_operands=1,
+        input_source_flag=input_source_flag,
+        dev_data_format=dev_data_format,
+        math_fidelity=input_math_fidelity,
+    )
+
+    # netlist validation:
+    file_path = VerifyUtils.get_netlist_filename()
+    d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_" + input_operator + "0")
+    if should_prolog:
+        assert d['prologue']
+    else:
+        assert not d['prologue']
+
+
+# Operand Data Format (DF) and Math Fidelity (MF)
+# We will not test all combinations of Data Format and Math Fidelity
+# because it would be too many tests.
+# Also, we will test DF and MF by fixing a single shape.
+#
+# 1. First we will choose Data Format to be Float16_b and test all Math Fidelity values
+# 2. Then we will set Math Fidelity to HiFi4 and test all Data Formats.
+
+### 1. ####################################################################################
+
+
+def get_single_shape(microbatch_size=1):
+    return (microbatch_size, 3, 3)      # Full tensor, small size
+
+# 5.4 Operand DFs
+
+dev_data_formats = [
+    pybuda.DataFormat.Float16_b,
+]
+
+# 6.
Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 +compiler_math_fidelity = [ + pybuda.MathFidelity.LoFi, #00 + pybuda.MathFidelity.HiFi2, #01 + pybuda.MathFidelity.HiFi3, #02 + pybuda.MathFidelity.HiFi4, #03 + ] + + +@pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) +@pytest.mark.parametrize("input_model", ["model_op_src_from_another_op"]) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_mf_eltwise_binary_ops_per_test_plan(input_operator, input_model, test_device, dev_data_format, math_fidelity): + test_eltwise_binary_ops_per_test_plan( + input_operator, + input_model, + get_single_shape(), + test_device, + dev_data_format, + math_fidelity, + ) + + +### 2. #################################################################################### + +# 5.4 Operand DFs + +dev_data_formats=[ + pybuda.DataFormat.Bfp2, #00 + pybuda.DataFormat.Bfp2_b, #01 + pybuda.DataFormat.Bfp4, #02 + pybuda.DataFormat.Bfp4_b, #03 + pybuda.DataFormat.Bfp8, #04 + pybuda.DataFormat.Bfp8_b, #05 + pybuda.DataFormat.Float16, #06 + pybuda.DataFormat.Float16_b, #07 + pybuda.DataFormat.Float32, #08 + pybuda.DataFormat.Int8, #09 + pybuda.DataFormat.Lf8, #10 + pybuda.DataFormat.RawUInt16, #11 + pybuda.DataFormat.RawUInt32, #12 + pybuda.DataFormat.RawUInt8, #13 + pybuda.DataFormat.UInt16, #14 +] + +# 6. Math fidelity +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] + + +@pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) +@pytest.mark.parametrize("input_model", ["model_op_src_from_another_op"]) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_df_eltwise_binary_ops_per_test_plan(input_operator, input_model, test_device, dev_data_format, math_fidelity): + test_eltwise_binary_ops_per_test_plan( + input_operator, + input_model, + get_single_shape(), + test_device, + dev_data_format, + math_fidelity, + ) + + +# ------------------------------------------------------------------------------------------------------------ +# Old test implementation using not simplified test models: +# (These old tests are deactivated) +# ------------------------------------------------------------------------------------------------------------ MODELS_PATH = "./pybuda/test/operators/eltwise_binary/models/" @@ -54,7 +511,7 @@ @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("mode", ["Inference"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -def test_eltwise_binary( +def obsoleted_test_eltwise_binary( mode, recompute, operation, From d73f2acbfad7fdf30295ffd75c66b4512f48026b Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 17 Jul 2024 15:47:10 +0000 Subject: [PATCH 038/116] NetlistValidation utils Issue #2554 / #2787 (cherry picked from commit 31462919e7fb502093ae4dc3ab5159ccb846ef2b) --- pybuda/test/operators/nary/test_stack.py | 17 +++++++++-------- pybuda/test/operators/utils/__init__.py | 2 ++ pybuda/test/operators/utils/utils.py | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/pybuda/test/operators/nary/test_stack.py b/pybuda/test/operators/nary/test_stack.py index 3540b6c9..1ca5c6da 100644 --- a/pybuda/test/operators/nary/test_stack.py +++ b/pybuda/test/operators/nary/test_stack.py @@ -66,8 +66,9 @@ from pybuda import PyBudaModule from 
pybuda.op_repo import TensorShape -from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import InputSourceFlags, CompilerUtils, VerifyUtils from test.operators.utils import ShapeUtils +from test.operators.utils import NetlistValidation from test.conftest import TestDevice @@ -322,9 +323,9 @@ def forward(self, x, y): math_fidelity=math_fidelity, ) - file_path = VerifyUtils.get_netlist_filename() - assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' - assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + netlist = NetlistValidation() + assert netlist.get_value("/queues/x/loc") == 'dram' + assert netlist.get_value("/queues/y/loc") == 'dram' def get_input_shapes_prologued(): @@ -384,8 +385,8 @@ def forward(self, x): math_fidelity=math_fidelity, ) - file_path = VerifyUtils.get_netlist_filename() - d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Stack0") + netlist = NetlistValidation() + d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Stack0") if should_prolog: assert d['prologue'] else: @@ -435,8 +436,8 @@ def forward(self, x, y): ) # Here we check there is no key with "Stack" in the netlist in graphs section - file_path = VerifyUtils.get_netlist_filename() - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + netlist = NetlistValidation() + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Stack" not in key diff --git a/pybuda/test/operators/utils/__init__.py b/pybuda/test/operators/utils/__init__.py index d9c3b26f..a925e631 100644 --- a/pybuda/test/operators/utils/__init__.py +++ b/pybuda/test/operators/utils/__init__.py @@ -6,6 +6,7 @@ from .utils import InputSourceFlag, InputSourceFlags from .utils import CompilerUtils from .utils import VerifyUtils +from .utils import NetlistValidation from .utils import LoggerUtils from .netlist_utils import read_netlist_value @@ -16,5 +17,6 @@ 'InputSourceFlags', 'CompilerUtils', 'VerifyUtils', + 'NetlistValidation', 'LoggerUtils', ] diff --git a/pybuda/test/operators/utils/utils.py b/pybuda/test/operators/utils/utils.py index 9ce1f6d7..47b09e9a 100644 --- a/pybuda/test/operators/utils/utils.py +++ b/pybuda/test/operators/utils/utils.py @@ -19,6 +19,8 @@ from pybuda._C import MathFidelity from test.conftest import TestDevice +from ..utils import netlist_utils + class ShapeUtils: @@ -96,6 +98,27 @@ def get_netlist_filename() -> str: '''Get netlist filename of the last compiled model''' return pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + +class NetlistValidation: + '''Utility functions for netlist validation''' + + def __init__(self): + self.netlist_filename = VerifyUtils.get_netlist_filename() + + def get_value(self, key_path: str): + """ + Reads a netlist value from a YAML file based on the given key path. + + Args: + key_path (str): The key path to the desired value in the YAML file. + Keys are separated by slashes ("/"). + + Returns: + The value corresponding to the given key path in the YAML file, or None if the key path is not found. 
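+
+        Example (illustrative sketch, mirroring the assertions in test_stack.py above):
+
+            netlist = NetlistValidation()
+            assert netlist.get_value("/queues/x/loc") == 'dram'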
+        """
+        return netlist_utils.read_netlist_value(self.netlist_filename, key_path)
+
+
 class LoggerUtils:
     '''Utility functions for logging'''

From e445af7582fd0da827bf1a1f59d3113848f4db9c Mon Sep 17 00:00:00 2001
From: chandrasekaranpradeep
Date: Thu, 18 Jul 2024 07:59:55 +0000
Subject: [PATCH 039/116] Decompose downsample 2d for non-square shape and add
 channel last support

(cherry picked from commit 4475e7a5c37c45a472f453efdf6bbb143848e5a9)
---
 pybuda/pybuda/op/eval/pybuda/resize.py        | 37 ++++++--
 pybuda/pybuda/op/eval/sparse_utils.py         | 45 +++-------
 .../test/tvm/sanity/tests_C/test_decomps.py   | 84 ++++++++++++++
 3 files changed, 125 insertions(+), 41 deletions(-)

diff --git a/pybuda/pybuda/op/eval/pybuda/resize.py b/pybuda/pybuda/op/eval/pybuda/resize.py
index ce35016f..7deef3c3 100644
--- a/pybuda/pybuda/op/eval/pybuda/resize.py
+++ b/pybuda/pybuda/op/eval/pybuda/resize.py
@@ -290,25 +290,48 @@ def decompose_downsample_2d(attr, dc, inputs, resize_method):
     if channel_last:
         cin = shape[-1]
         scale_factor = shape[-3] // attr[0]
+        # Transpose the activation to have channels as the first dimension.
+        # (N, H, W, C) -> transpose(-3, -1) -> (N, C, W, H)
         activations = dc.op(TransposeTM.create(-3, -1), [activations])
+
+        # (N, C, W, H) -> transpose(-2, -1) -> (N, C, H, W)
         activations = dc.op(TransposeTM.create(-2, -1), [activations])
     else:
         cin = shape[-3]
         scale_factor = shape[-2] // attr[0]

     if resize_method == "nearest":
-        dident = create_nearest_neighbor_downsample_picker_matrix(scale_factor, shape, channel_last=channel_last)
+        dident_1 = create_nearest_neighbor_downsample_picker_matrix(scale_factor, activations.shape)
     else:
         raise NotImplementedError("Only nearest neighbor downsample is supported")
+
+    dident_1 = dident_1.unsqueeze(0).unsqueeze(0)
+    dident_1 = torch.cat([dident_1]*cin, dim=-3)
+    dident_1_tensor = dc.tensor(dident_1)

-    dident = dident.unsqueeze(0).unsqueeze(0)
-    dident = torch.cat([dident]*cin, dim=-3)
-
-    dident_tensor = dc.tensor(dident)
-    result = dc.op("sparse_matmul", [dident_tensor, activations])
+    # E.g.: activation_shape = (1, 3, 4, 8)
+    #       scale_factor = 2
+    #       channel_last = False
+
+    # matmul_1 = dident_1_tensor x activations
+    # (1, 3, 2, 8) = (1, 3, 2, 4) x (1, 3, 4, 8)
+    result = dc.op("sparse_matmul", [dident_1_tensor, activations])
+
+    # transpose_1 = matmul_1.transpose(-2, -1)
+    # (1, 3, 8, 2) <--- (1, 3, 2, 8)
     result = dc.op(TransposeTM.create(-2, -1), [result])
+
+    dident_2 = create_nearest_neighbor_downsample_picker_matrix(scale_factor, result.shape)
+    dident_2 = dident_2.unsqueeze(0).unsqueeze(0)
+    dident_2 = torch.cat([dident_2]*cin, dim=-3)
+    dident_2_tensor = dc.tensor(dident_2)
+
+    # matmul_2 = dident_2_tensor x transpose_1
+    # (1, 3, 4, 2) = (1, 3, 4, 8) x (1, 3, 8, 2)
+    result = dc.op("sparse_matmul", [dident_2_tensor, result])  # z, x, y
+    # transpose_2 = matmul_2.transpose(-2, -1)
+    # (1, 3, 2, 4) <--- (1, 3, 4, 2)

     if channel_last:
         result = dc.op(TransposeTM.create(-3, -1), [result])  # y, x, z
     else:
diff --git a/pybuda/pybuda/op/eval/sparse_utils.py b/pybuda/pybuda/op/eval/sparse_utils.py
index bc7d8971..f94fd596 100644
--- a/pybuda/pybuda/op/eval/sparse_utils.py
+++ b/pybuda/pybuda/op/eval/sparse_utils.py
@@ -713,42 +713,19 @@ def create_nearest_neighbor_upsample_picker_matrix(
     )

 def create_nearest_neighbor_downsample_picker_matrix(
-    scale_factor, shape, tile_align=False, channel_last=False,
+    scale_factor, shape, tile_align=False,
 ):
-    if channel_last:
-        rows = torch.arange((shape[-3]
// scale_factor) * (shape[-2] // scale_factor)) - rows = scale_factor * (rows // scale_factor) - cols = [] - for i in range(shape[-3]): - col = ( - torch.arange(shape[-2]).repeat_interleave(scale_factor).repeat(scale_factor) - + i * align_up_tile(shape[-2]) - ) - cols.append(col) - - cols = torch.concat(cols) - - sparse_r = rows.shape[0] - sparse_c = align_up_tile(shape[-2]) * shape[-3] - if tile_align: - sparse_r = align_up_tile(sparse_r) - sparse_c = align_up_tile(sparse_c) - - return torch.sparse_coo_tensor( - [rows.tolist(), cols.tolist()], torch.ones(cols.shape[0]), (sparse_r, sparse_c) - ) - else: - cols = torch.arange(shape[-2] // scale_factor)*scale_factor - rows = cols // scale_factor - sparse_r = cols.shape[0] - sparse_c = shape[-1] - if tile_align: - sparse_r = align_up_tile(sparse_r) - sparse_c = align_up_tile(sparse_c) + cols = torch.arange(shape[-2] // scale_factor)*scale_factor + rows = cols // scale_factor + sparse_r = cols.shape[0] + sparse_c = shape[-2] + if tile_align: + sparse_r = align_up_tile(sparse_r) + sparse_c = align_up_tile(sparse_c) - return torch.sparse_coo_tensor( - [rows.tolist(), cols.tolist()], torch.ones(cols.shape[0]), (sparse_r, sparse_c) - ) + return torch.sparse_coo_tensor( + [rows.tolist(), cols.tolist()], torch.ones(cols.shape[0]), (sparse_r, sparse_c) + ) def create_bilinear_upsample_picker_matrix( diff --git a/pybuda/test/tvm/sanity/tests_C/test_decomps.py b/pybuda/test/tvm/sanity/tests_C/test_decomps.py index 8ff9cbb3..57ec682f 100644 --- a/pybuda/test/tvm/sanity/tests_C/test_decomps.py +++ b/pybuda/test/tvm/sanity/tests_C/test_decomps.py @@ -821,6 +821,90 @@ def forward(self, x): verify_cfg=VerifyConfig(), ) + +input_shapes = [list((1, 3, dim, dim*2)) for dim in range(4, 20)] +scale_factors = list(range(2, 10)) +@pytest.mark.parametrize("input_shape", input_shapes) +@pytest.mark.parametrize("scale_factor", scale_factors) +def test_downsample_2d_nearest_channel_first_pytorch(test_device, input_shape, scale_factor): + + if input_shape[-1] % scale_factor != 0 or input_shape[-2] % scale_factor != 0: + pytest.skip("input_shape must be divisible by scale_factor") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + class downsample_2d_model(torch.nn.Module): + def __init__(self, scale_factor): + super().__init__() + self.scale_factor = scale_factor + + def forward(self, input_tensor): + return torch.nn.functional.interpolate(input_tensor, scale_factor=1/self.scale_factor, mode='nearest') + + model = downsample_2d_model(scale_factor=scale_factor) + model.eval() + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule( + "pt_downsample_2d", model + ) + + input_sample = torch.rand(input_shape) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(input_sample.shape,)], + inputs=[(input_sample,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + ), + ) + + +input_shapes = [list((1, dim, dim*2, 3)) for dim in range(4, 20)] +scale_factors = list(range(2, 10)) +@pytest.mark.parametrize("input_shape", input_shapes) +@pytest.mark.parametrize("scale_factor", scale_factors) +def test_downsample_2d_nearest_channel_last_pytorch(test_device, input_shape, scale_factor): + + if 
input_shape[1] % scale_factor != 0 or input_shape[2] % scale_factor != 0: + pytest.skip("input_shape must be divisible by scale_factor") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + class Downsample2d(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, input): + return pybuda.op.Resize2d("", input, sizes=[input_shape[1] // scale_factor, input_shape[2] // scale_factor], method="nearest_neighbor", channel_last=True) + + model = Downsample2d("Downsample2d_channel_last") + + + verify_module( + model, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + + def get_factorization(n): factors = [] i = 2 From 603982e7ec66e745a668b758e2f3740e848cbcf3 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 19 Jul 2024 14:33:36 +0000 Subject: [PATCH 040/116] Quantize-Dequantize Support (cherry picked from commit a1eb38b18ac9a0cbda3b870b2e39316e4f3c3214) --- pybuda/csrc/buda_passes.cpp | 29 +- pybuda/csrc/passes/commute_utils.cpp | 148 ++++++- pybuda/csrc/passes/commute_utils.hpp | 16 +- pybuda/csrc/passes/consteval.cpp | 14 +- pybuda/csrc/passes/dataformat.cpp | 2 +- .../csrc/passes/dequant_quant_to_requant.cpp | 107 +++++ .../csrc/passes/dequant_quant_to_requant.hpp | 14 + pybuda/csrc/passes/erase_inverse_ops.cpp | 34 +- ...nsert_inverse_outside_quantized_region.cpp | 161 ++++++++ ...nsert_inverse_outside_quantized_region.hpp | 17 + pybuda/csrc/passes/insert_qdq_on_biases.cpp | 267 +++++++++++++ pybuda/csrc/passes/insert_qdq_on_biases.hpp | 14 + pybuda/csrc/passes/make_quantized_ops.cpp | 365 ++++++++++++++++++ pybuda/csrc/passes/make_quantized_ops.hpp | 14 + pybuda/csrc/passes/move_dequantize.cpp | 356 +++++++++++++++++ pybuda/csrc/passes/move_dequantize.hpp | 14 + pybuda/csrc/passes/move_requantize.cpp | 2 +- pybuda/csrc/passes/remove_quant_dequant.cpp | 96 +++++ pybuda/csrc/passes/remove_quant_dequant.hpp | 14 + pybuda/pybuda/op/eval/buda/matmul.py | 5 + pybuda/pybuda/op/eval/buda/quantize.py | 6 +- pybuda/pybuda/op/eval/buda/tm.py | 4 +- pybuda/pybuda/op/eval/pybuda/matmul.py | 1 + pybuda/pybuda/op/eval/pybuda/quantize.py | 133 ++++--- pybuda/pybuda/op/eval/pybuda/tm.py | 10 +- pybuda/pybuda/python_codegen.py | 2 +- pybuda/pybuda/tvm_to_python.py | 34 +- .../test/quantized/test_onnx_qdq_commute.py | 114 ++++++ .../test_onnx_quantized_mobilenet.py | 50 ++- .../quantized/test_onnx_quantized_resnet.py | 46 +++ .../test/quantized/test_onnx_quantized_vit.py | 50 +++ 31 files changed, 2061 insertions(+), 78 deletions(-) create mode 100644 pybuda/csrc/passes/dequant_quant_to_requant.cpp create mode 100644 pybuda/csrc/passes/dequant_quant_to_requant.hpp create mode 100644 pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp create mode 100644 pybuda/csrc/passes/insert_inverse_outside_quantized_region.hpp create mode 100644 pybuda/csrc/passes/insert_qdq_on_biases.cpp create mode 100644 pybuda/csrc/passes/insert_qdq_on_biases.hpp create mode 100644 pybuda/csrc/passes/make_quantized_ops.cpp create mode 100644 pybuda/csrc/passes/make_quantized_ops.hpp create mode 100644 pybuda/csrc/passes/move_dequantize.cpp create mode 100644 pybuda/csrc/passes/move_dequantize.hpp create mode 100644 pybuda/csrc/passes/remove_quant_dequant.cpp create mode 100644 
pybuda/csrc/passes/remove_quant_dequant.hpp create mode 100644 pybuda/test/quantized/test_onnx_qdq_commute.py diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index b2c8ad99..39cac9f2 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -17,6 +17,7 @@ #include "passes/decomposing_context.hpp" #include "passes/erase_consecutive_reshape.hpp" #include "passes/erase_inverse_ops.hpp" +#include "passes/insert_inverse_outside_quantized_region.hpp" #include "passes/erase_unnecessary_4d_tm_sequence.hpp" #include "passes/explicate_unsqueeze.hpp" #include "passes/fork_join.hpp" @@ -34,7 +35,12 @@ #include "passes/lower_concat_to_runtime_transform.hpp" #include "passes/lower_reinterpret_shape.hpp" #include "passes/lowering_context.hpp" +#include "passes/move_dequantize.hpp" #include "passes/move_requantize.hpp" +#include "passes/remove_quant_dequant.hpp" +#include "passes/insert_qdq_on_biases.hpp" +#include "passes/dequant_quant_to_requant.hpp" +#include "passes/make_quantized_ops.hpp" #include "passes/move_select_after_matmul_optional.hpp" #include "passes/pad_output_buffer.hpp" #include "passes/passes_utils.hpp" @@ -92,6 +98,18 @@ run_post_initial_graph_passes(graphlib::Graph *graph, py::object compiler_cfg_ob passes::print_graph(graph, "INITIAL"); passes::generate_initial_flops_estimate(graph); + // These passes must be run in a loop as its possible that after + // Pushing a dequant through a conv/matmul/etc it can be moved down further + bool attempt_update = true; + while (attempt_update) { + attempt_update = passes::move_dequantize(graph); + attempt_update |= passes::make_quantized_ops(graph); + attempt_update |= passes::insert_qdq_on_biases(graph); + attempt_update |= passes::dequant_quant_to_requant(graph); + } + + passes::remove_quant_dequant(graph); + reportify::dump_graph(graph->name(), "post_quantize_commute", graph); passes::decompose_nd_reshape_split(graph); passes::limit_to_4d_reshape(graph); passes::erase_unnecessary_4d_tm_sequence(graph); @@ -161,6 +179,15 @@ void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &d passes::bypass_nop_tms(graph); } } + + // Move TMs outside of quantized graph regions + // attempt_update = true; + // while(attempt_update) { + // passes::insert_inverse_outside_quantized_region(graph); + // attempt_update = passes::erase_inverse_ops(graph); + // } + + passes::move_tm_through_requantize(graph); recalculate_shapes(graph); @@ -177,7 +204,6 @@ void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &d passes::move_select_after_matmul_optional(graph); passes::fuse_tm_sequences(graph); - reportify::dump_graph(graph->name(), "post_erase_inverse_ops", graph); } std::vector> run_post_optimize_decompose_graph_passes( @@ -396,7 +422,6 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ fracture_chip_id_assignments, "" /* nops_remote_devices_postfix */, use_interactive_placer); - return std::make_pair(std::move(lowered_graph), placer_config_update); } diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index d01e8dfa..338b0a90 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -797,17 +797,151 @@ bool commute_through_eltwise( return true; } -bool commute_through_quantization( +bool commute_through_squeeze( graphlib::OpNode* op, - graphlib::Shape *commute_shape, - graphlib::OpType *golden_transform) + graphlib::OpNode* initial_op, + graphlib::Shape* commute_shape, + graphlib::Shape* clone_shape, + 
graphlib::OpType* golden_transform, + bool commute_up, + bool check_only) +{ + TT_ASSERT(op->op_name() == "squeeze", "Op is not a squeeze op"); + if (commute_up) + return false; + + // Only commute transpose through squeeze for now + if (initial_op->op_name() != "transpose") + return false; + + std::vector op_attrs = op->op_attrs(); + + // Commute only if squeeze dim is 0 for now + if (std::get(op_attrs[0]) != 0) + return false; + + if ((*commute_shape)[0] != 1) + return false; + + if (check_only) + return true; + + auto updated_commute_shape = commute_shape->as_vector(); + updated_commute_shape.erase(updated_commute_shape.begin()); + *commute_shape = graphlib::Shape::create(updated_commute_shape); + op->set_shape(*commute_shape); + op->add_golden_transform(*golden_transform); + + auto updated_clone_shape = clone_shape->as_vector(); + updated_clone_shape.erase(updated_clone_shape.begin()); + *clone_shape = graphlib::Shape::create(updated_clone_shape); + + return true; +} + +bool can_commute_through_squeeze( + graphlib::OpNode* op, + graphlib::OpNode* initial_op, + graphlib::Shape* commute_shape, + graphlib::Shape* clone_shape, + bool commute_up) +{ + return commute_through_squeeze(op, initial_op, commute_shape, clone_shape, nullptr, commute_up, true); +} + + +bool commute_through_quantization( + graphlib::OpNode* op, + graphlib::OpNode* initial_op, + bool check_only, + graphlib::Shape* commute_shape, + graphlib::OpType* golden_transform, + bool commute_up) { TT_ASSERT(is_quantization_ops(op), "op must be an quantization op"); + if (commute_up) + return false; + + int axis = std::get(op->op_attrs()[1]); + int new_axis = axis; + bool can_commute = false; + + if (initial_op->op_type().op == "reshape") { + + if (not commute_up) { + // axis of quantization must have the same volume to the left and right of it + + if (new_axis < 0) + new_axis += op->shape().size(); + + // check if axis moved to the right (or in the same place) + while (new_axis < (int)commute_shape->size()) { + if ((*commute_shape)[new_axis] == op->shape()[axis]) { + if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) + and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { + can_commute = true; + } + break; + } + new_axis++; + } + if (not can_commute) { + new_axis = axis-1; + while (new_axis >= 0) { + if ((*commute_shape)[new_axis] == op->shape()[axis]) { + if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) + and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { + can_commute = true; + } + break; + } + new_axis--; + } + } + } + } + else if (initial_op->op_type().op == "transpose") + { + can_commute = true; + if (new_axis < 0) + new_axis += op->shape().size(); + + int transpose_dim0 = initial_op->op_type().get_attr_as("dim0"); + int transpose_dim1 = initial_op->op_type().get_attr_as("dim1"); + if (transpose_dim0 < 0) + transpose_dim0 += commute_shape->size(); + + if (transpose_dim1 < 0) + transpose_dim1 += commute_shape->size(); + + if (new_axis == transpose_dim0) + new_axis = transpose_dim1; + else if (new_axis == transpose_dim1) + new_axis = transpose_dim0; + } + + if (check_only) + return can_commute; + + TT_ASSERT(can_commute, "Should not have called this if it is incommutable."); + + std::vector op_attrs = op->op_attrs(); + op_attrs[1] = new_axis; op->set_shape(*commute_shape); + op->overwrite_op_attrs(op_attrs); 
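+    // e.g. for a rank-4 tensor and an initial transpose of dims -3 and -1, a per-channel
+    // quantize axis of 1 is remapped to 3 (and vice versa), so the scale vector stays
+    // attached to the channel dimension after the commute.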
op->add_golden_transform(*golden_transform); return true; } +bool can_commute_through_quantize( + graphlib::OpNode* op, + graphlib::OpNode* initial_op, + graphlib::Shape* commute_shape, + bool commute_up) +{ + return commute_through_quantization(op, initial_op, true, commute_shape, nullptr, commute_up); +} + bool is_elementwise(graphlib::OpNode *op) { py::object eval_module = py::module_::import("pybuda.op.eval.pybuda"); @@ -846,8 +980,14 @@ bool can_commute_past_op( bool can_commute = can_commute_through_select(graph, op, initial_op, producer, commute_shape, clone_shape, commute_up); return can_commute; } + else if (is_quantization_ops(op)) { + bool can_commute = can_commute_through_quantize(op, initial_op, commute_shape, commute_up); + return can_commute; + } else if (op->op_name() == "squeeze") { + return can_commute_through_squeeze(op, initial_op, commute_shape, clone_shape, commute_up); + } - return (is_elementwise(op) and op->op_name() != "interleave") or is_quantization_ops(op); + return (is_elementwise(op) and op->op_name() != "interleave"); } void update_reshape_attr(graphlib::OpNode *reshape, graphlib::Shape new_shape) diff --git a/pybuda/csrc/passes/commute_utils.hpp b/pybuda/csrc/passes/commute_utils.hpp index 504e1592..4deb00bc 100644 --- a/pybuda/csrc/passes/commute_utils.hpp +++ b/pybuda/csrc/passes/commute_utils.hpp @@ -123,9 +123,21 @@ bool commute_through_eltwise( graphlib::OpType *golden_transform=nullptr); bool commute_through_quantization( + graphlib::OpNode* op, + graphlib::OpNode* initial_op, + bool check_only, + graphlib::Shape* commute_shape, + graphlib::OpType* golden_transform=nullptr, + bool commute_up=false); + +bool commute_through_squeeze( graphlib::OpNode* op, - graphlib::Shape *commute_shape=nullptr, - graphlib::OpType *golden_transform=nullptr); + graphlib::OpNode* initial_op, + graphlib::Shape* commute_shape, + graphlib::Shape* clone_shape, + graphlib::OpType* golden_transform, + bool commute_up, + bool check_only); bool is_elementwise(graphlib::OpNode *op); bool is_quantization_ops(graphlib::OpNode *op); diff --git a/pybuda/csrc/passes/consteval.cpp b/pybuda/csrc/passes/consteval.cpp index 1ffc777d..b030293e 100644 --- a/pybuda/csrc/passes/consteval.cpp +++ b/pybuda/csrc/passes/consteval.cpp @@ -38,10 +38,20 @@ static bool input_can_consteval(graphlib::Graph *graph, graphlib::InputNode *inp return op->op_name() == "broadcast" or op->op_name() == "repeat" or op->op_name() == "repeat_dim"; }; + // We want to go from weights->quantize->dequantize to quantized_weights->dequantize + // without constevaling dequantize, as it would cancel the quantize op + auto is_dequantize = [](graphlib::Node *node) { + graphlib::OpNode *op = dynamic_cast(node); + if (not op) + return false; + + return op->op_name() == "dequantize" or op->op_name() == "buda_dequantize"; + }; + TT_ASSERT(graphlib::is_consteval_capable_input_type(input)); std::vector users = graph->data_users(input); - auto user_can_consteval = [graph, is_broadcast_or_repeat](graphlib::Node *n) { - return graphlib::is_consteval_capable_op(graph, n, true /*allow_forks*/) and not is_broadcast_or_repeat(n); + auto user_can_consteval = [graph, is_broadcast_or_repeat, is_dequantize](graphlib::Node *n) { + return graphlib::is_consteval_capable_op(graph, n, true /*allow_forks*/) and not is_broadcast_or_repeat(n) and not is_dequantize(n); }; return not has_same_fork_destinations(users) and std::all_of(users.begin(), users.end(), user_can_consteval); // TODO: nsmith enable this diff --git 
a/pybuda/csrc/passes/dataformat.cpp b/pybuda/csrc/passes/dataformat.cpp index dbdeb68d..e4a91df3 100644 --- a/pybuda/csrc/passes/dataformat.cpp +++ b/pybuda/csrc/passes/dataformat.cpp @@ -562,7 +562,7 @@ void fix_data_formats(graphlib::Graph *graph, bool fp32_acc_supported) op->accumulate_df()); op->set_accumulate_df(DataFormat::Int32); } - if (op->output_df() != DataFormat::Int32) + if (op->output_df() != DataFormat::Int32 and op->op_name() != "dequantization") { // Requantization must be applied if (not (op->buda_attrs().find("has_requant") != op->buda_attrs().end() and diff --git a/pybuda/csrc/passes/dequant_quant_to_requant.cpp b/pybuda/csrc/passes/dequant_quant_to_requant.cpp new file mode 100644 index 00000000..c0657959 --- /dev/null +++ b/pybuda/csrc/passes/dequant_quant_to_requant.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "graph_lib/node_types.hpp" +#include "graph_lib/utils.hpp" +#include "utils/logger.hpp" +#include "python_bindings_common.hpp" +#include "graph_lib/node.hpp" +#include "graph_lib/graph.hpp" +#include "passes/dequant_quant_to_requant.hpp" +#include "reportify/reportify.hpp" + +namespace tt::passes +{ + +void replace_dq_q_with_req(graphlib::Graph *graph, graphlib::OpNode *dequantize, graphlib::OpNode *quantize) { + TT_ASSERT(dequantize->op_type().op == "dequantize" and quantize->op_type().op == "quantize", "Improper ops passed."); + TT_ASSERT(graph->data_users(dequantize).size() == 1, "Only support dequant with one child, quantize"); + + // The requantize axis should be the axis which contains the size equal to the max of the scale sizes between quantize and dequant + graphlib::Node *deq_scale = graph->data_operands(dequantize)[1]; + graphlib::Node *q_scale = graph->data_operands(quantize)[1]; + + int max_size = deq_scale->shape()[0] > q_scale->shape()[0] ? 
deq_scale->shape()[0] : q_scale->shape()[0]; + + int requant_axis = -1; + for (int i = (int)quantize->shape().size()-1; i >= 0; i--) { + if ((int)quantize->shape()[i] == max_size) { + requant_axis = i; + break; + } + } + TT_ASSERT(requant_axis >= 0, "Requant axis should have been set"); + + std::vector requant_attrs{0.0f, 0.0f, requant_axis, true, std::string("torch.int8")}; + + for (graphlib::Edge consumer_edge : graph->user_data_edges(quantize)) { + std::string name = dequantize->name() + "_" + quantize->name() + "_combined_requantize_" + std::to_string(consumer_edge.edge_creation_id); + graphlib::OpNode *requant = graph->add_node(graphlib::create_node(name, "requantize"), + graph->get_subgraph_id_for_node(quantize->id())); + + requant->overwrite_op_attrs(requant_attrs); + + requant->set_shape(quantize->shape()); + insert_node_on_edge(graph, consumer_edge, requant); + graph->add_edge(deq_scale, requant); + graph->add_edge(q_scale, requant); + requant->set_output_df(tt::DataFormat::Int8); + } + + // Remove scale edges so that bypass node works (it requires that the node has one operand) + graphlib::Edge old_deq_scale_edge = retrieve_between_edge(graph, deq_scale, dequantize); + graphlib::Edge old_q_scale_edge = retrieve_between_edge(graph, q_scale, quantize); + graph->remove_edge(old_deq_scale_edge); + graph->remove_edge(old_q_scale_edge); + + bypass_node(graph, dequantize, true); + bypass_node(graph, quantize, true); + +} + +bool dequant_quant_to_requant(graphlib::Graph *graph) { + + bool attempt_update = true; + bool graph_changed = false; + while (attempt_update) { + attempt_update = false; + for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) { + graphlib::OpNode *op_node = dynamic_cast(node); + if (not op_node) + continue; + + if (graph->data_users(op_node).size() != 1) + continue; + + graphlib::OpNode *op_child = dynamic_cast(graph->data_users(op_node)[0]); + if (not op_child) + continue; + + // Dequantize should only have one user edge going into the dequantize + graphlib::Edge user_edge = graph->user_data_edges(op_node)[0]; + if (graph->get_edge_attributes(user_edge)->get_tms().size() > 0) + continue; + + // Must be a dequantize followed by a quantize + if (op_node->op_type().op != "dequantize" or op_child->op_type().op != "quantize") + continue; + + + // Quantize should be producing an int8 + // if (std::get(op_child->op_attrs()[4]) != std::string("torch.int8")) + // continue; + + replace_dq_q_with_req(graph, op_node, op_child); + graph_changed = true; + attempt_update = true; + break; + + } + } + + return graph_changed; +} +} \ No newline at end of file diff --git a/pybuda/csrc/passes/dequant_quant_to_requant.hpp b/pybuda/csrc/passes/dequant_quant_to_requant.hpp new file mode 100644 index 00000000..3ffdff68 --- /dev/null +++ b/pybuda/csrc/passes/dequant_quant_to_requant.hpp @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +namespace tt::graphlib +{ +class Graph; +} + +namespace tt::passes +{ +bool dequant_quant_to_requant(graphlib::Graph *graph); +} \ No newline at end of file diff --git a/pybuda/csrc/passes/erase_inverse_ops.cpp b/pybuda/csrc/passes/erase_inverse_ops.cpp index ee127287..731b0acb 100644 --- a/pybuda/csrc/passes/erase_inverse_ops.cpp +++ b/pybuda/csrc/passes/erase_inverse_ops.cpp @@ -160,10 +160,13 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co handle_change_rank(graph, clone); 
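 // Match the clone's output data format to the producer it now follows, so the
 // commuted clone does not perturb data formats downstream.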
clone->set_output_df(graph->node_by_id(incoming_edge.producer_node_id)->output_df()); } - + + std::vector original_op_attrs{}; // Set the shape to the desired final shape for this whole path if (graphlib::OpNode *op = dynamic_cast(consumer)) { + original_op_attrs = op->op_attrs(); + graphlib::OpNode *producer_as_op = dynamic_cast(producer); if (producer_as_op) { // Must change commute shape, clone shape, and golden transform if there are broadcasts on the incoming edge @@ -230,15 +233,19 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co commute_through_eltwise(op, &commute_shape, &golden_transform); } else if (is_quantization_ops(op)) { - commute_through_quantization(op, &commute_shape, &golden_transform); + commute_through_quantization(op, first, false, &commute_shape, &golden_transform); + } + else if (op->op_name() == "squeeze") { + commute_through_squeeze(op, first, &commute_shape, &clone_shape, &golden_transform, false, false); } log_trace(LogGraphCompiler, " Op node: {} -> shape set to {}", consumer->name(), commute_shape); } // Handle nary operands (not on this `path`) std::vector consumer_operands = graph->operand_data_edges(consumer); - for (graphlib::Edge operand_edge : consumer_operands) + for (uint32_t operand_index = 0; operand_index < consumer_operands.size(); operand_index++) { + graphlib::Edge operand_edge = consumer_operands[operand_index]; if (operand_edge.producer_node_id == producer->id()) continue; @@ -247,7 +254,21 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co graphlib::Node *clone = graph->add_node(last->clone(name), graph->get_subgraph_id_for_node(operand_edge.producer_node_id)); graphlib::OpNode *op = dynamic_cast(clone); log_trace(LogGraphCompiler, " Operand commute clone: {} -> between {} and {} ", name, consumer->name(), graph->node_by_id(operand_edge.producer_node_id)->name()); - if (retain_operand_dim) + + // Special case for operand clones on a quantization scale + auto *consumer_op = dynamic_cast(consumer); + if (is_quantization_ops(consumer_op) and operand_index == 1) { + + // The shape should be all 1's except for (possiby) the quantization axis + auto updated_commute_shape = commute_shape; + int quant_axis = std::get(consumer_op->op_attrs()[1]); + updated_commute_shape[quant_axis] = consumer_op->shape()[quant_axis]; + update_reshape_attr(op, updated_commute_shape); + clone->set_shape(updated_commute_shape); + log_trace(LogGraphCompiler, " Operand commute clone shape: {}", updated_commute_shape); + + } + else if (retain_operand_dim) { auto updated_commute_shape = commute_shape; updated_commute_shape[operand_dims.second] = graph->node_by_id(operand_edge.producer_node_id)->shape()[operand_dims.first]; @@ -289,10 +310,12 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co } } std::vector tms = graph->get_edge_attributes(operand_edge)->get_tms(); - for (graphlib::OpType& tm : tms) { if (tm.op == "broadcast") { int dim = std::get(tm.attr[0]); + if (dim >= 0) { + dim -= input->shape().size(); + } int volume = std::get(tm.attr[1]); op_shape[dim] *= volume; } @@ -304,6 +327,7 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co auto [in_edge, out_edge] = insert_node_on_edge(graph, operand_edge, clone); // Set dataformat to match producer on operand edge clone->set_output_df(graph->node_by_id(in_edge.producer_node_id)->output_df()); + handle_change_rank(graph, clone); try_commute_bcast_through_clone(graph, op); if (graphlib::InputNode *input = dynamic_cast(graph->data_operands(clone)[0])) diff --git 
a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp new file mode 100644 index 00000000..349e4e0b --- /dev/null +++ b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp @@ -0,0 +1,161 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "passes/insert_inverse_outside_quantized_region.hpp" +#include "passes/erase_inverse_ops.hpp" + +#include + +#include + +#include "graph_lib/node_types.hpp" +#include "graph_lib/utils.hpp" +#include "passes/commute_utils.hpp" +#include "passes/passes_utils.hpp" +#include "reportify/reportify.hpp" +#include "utils/logger.hpp" + +namespace tt::passes +{ + +bool is_op_in_quantized_region(graphlib::OpNode *op) +{ + std::vector int_types{ + DataFormat::Int32, + DataFormat::Int8, + DataFormat::UInt16, + DataFormat::RawUInt8, + DataFormat::RawUInt32, + DataFormat::RawUInt16}; + return std::find(int_types.begin(), int_types.end(), op->output_df()) != int_types.end(); +} + +static std::vector find_downward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op) { + std::vector path; + + graphlib::OpNode *iter = initial_op; + + auto clone_shape = initial_op->shape(); + auto commute_shape = shape_of_only_operand(graph, initial_op); + + bool found_dequantize = false; + while (not found_dequantize) { + graphlib::OpNode *op = dynamic_cast(iter); + TT_ASSERT(op); + + // For now if there are multiple children then dont commute + std::vector user_edges = graph->user_data_edges(op); + if (user_edges.size() > 1) + break; + + graphlib::Edge user_edge = user_edges[0]; + + + // For now, if there are any edge tms just dont commute + if (op != initial_op) { + std::vector tms = graph->get_edge_attributes(user_edge)->get_tms(); + if (tms.size() > 0) { + break; + } + } + + + bool can_commute = can_commute_past_op(op, initial_op, graph, &commute_shape, &clone_shape, false); + if (not can_commute and op != initial_op) { + break; + } + path.push_back(op); + if (is_quantization_ops(op)) + found_dequantize = true; + + iter = dynamic_cast(graph->node_by_id(user_edge.consumer_node_id)); + if (not iter) + break; + } + + if (not found_dequantize) + path.clear(); + + return path; +} + +void insert_inverse_pair_below(graphlib::Graph *graph, graphlib::OpNode *transpose_op, std::vector edges) { + + const graphlib::OpType orig_op_type = transpose_op->op_type(); + + for (graphlib::Edge edge : edges) { + + graphlib::Node *operand = graph->node_by_id(edge.producer_node_id); + + const std::string inverse_name = transpose_op->name() + "_quant_remove_clone" + std::to_string(edge.edge_creation_id); + auto *clone_inverse = graph->add_node(transpose_op->clone(inverse_name), graph->get_subgraph_id_for_node(edge.consumer_node_id)); + graphlib::OpNode *clone_inverse_op = dynamic_cast(clone_inverse); + + clone_inverse_op->op_type().set_attr("dim0", orig_op_type.get_attr("dim1")); + clone_inverse_op->op_type().set_attr("dim1", orig_op_type.get_attr("dim0")); + clone_inverse_op->op_type().set_attr("z_dim_slice", orig_op_type.get_attr("z_dim_slice")); + auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, edge, clone_inverse_op); + clone_inverse_op->set_output_df_from_operands(graph); + graphlib::Shape clone_inverse_shape = operand->shape(); + clone_inverse_shape[orig_op_type.get_attr_as("dim0")] = operand->shape()[orig_op_type.get_attr_as("dim1")]; + clone_inverse_shape[orig_op_type.get_attr_as("dim1")] = 
operand->shape()[orig_op_type.get_attr_as("dim0")]; + clone_inverse_op->set_shape(clone_inverse_shape); + + const std::string clone_name = transpose_op->name() + "_quant_remove_clone" + std::to_string(outgoing_edge.edge_creation_id); + graphlib::Node* clone = graph->add_node( + transpose_op->clone(clone_name), + graph->get_subgraph_id_for_node(edge.consumer_node_id) + ); + graphlib::OpNode *clone_op = dynamic_cast(clone); + insert_node_on_edge(graph, outgoing_edge, clone_op); + clone_op->set_output_df_from_operands(graph); + graphlib::Shape clone_shape = operand->shape(); + clone_op->set_shape(clone_shape); + } + +} + +bool insert_inverse_outside_quantized_region(graphlib::Graph *graph) +{ + bool updated_anything = false; + bool attempt_update = true; + + std::vector ops_already_checked; + + while (attempt_update) + { + attempt_update = false; + for (auto *node : graphlib::topological_sort(*graph)) + { + graphlib::OpNode *op = dynamic_cast(node); + + if (not op) + continue; + + if (op->op_name() != "transpose") + continue; + + if (not is_op_in_quantized_region(op)) + continue; + + if (std::find(ops_already_checked.begin(), ops_already_checked.end(), op) != ops_already_checked.end()) + continue; + + std::vector downward_path = find_downward_path_out(graph, op); + + if (not downward_path.empty()) { + // Insert inverse pair on all outgoing edges of last node in downward path + graphlib::Node *last_node = downward_path.back(); + insert_inverse_pair_below(graph, op, graph->user_data_edges(last_node)); + ops_already_checked.push_back(op); + updated_anything = true; + attempt_update = true; + break; + } + + } + } + reportify::dump_graph(graph->name(), "move_transpose", graph); + return updated_anything; +} +} // namespace tt::passes \ No newline at end of file diff --git a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.hpp b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.hpp new file mode 100644 index 00000000..301939fe --- /dev/null +++ b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.hpp @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +namespace tt::graphlib +{ +class Graph; +class OpNode; +class Shape; +} + +namespace tt::passes +{ +// Returns true if any transposes were moved outside quantized regions +bool insert_inverse_outside_quantized_region(graphlib::Graph *graph); +} \ No newline at end of file diff --git a/pybuda/csrc/passes/insert_qdq_on_biases.cpp b/pybuda/csrc/passes/insert_qdq_on_biases.cpp new file mode 100644 index 00000000..0c354566 --- /dev/null +++ b/pybuda/csrc/passes/insert_qdq_on_biases.cpp @@ -0,0 +1,267 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "graph_lib/node_types.hpp" +#include "graph_lib/utils.hpp" +#include "utils/logger.hpp" +#include "python_bindings_common.hpp" +#include "graph_lib/node.hpp" +#include "graph_lib/graph.hpp" +#include "passes/insert_qdq_on_biases.hpp" +#include "reportify/reportify.hpp" + + +namespace tt::passes +{ + +bool can_insert_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) { + if (conv2d->op_type().op != "conv2d") + return false; + + if (graph->data_operands(conv2d).size() != 3) + return false; + + // Both act and weight must have a dequant node as input and the bias cannot + graphlib::OpNode *act = dynamic_cast(graph->data_operands(conv2d)[0]); + graphlib::OpNode *weight = dynamic_cast(graph->data_operands(conv2d)[1]); + 
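+    // The bias (operand 2, below) must not already be fed by a dequantize; otherwise the
+    // quantize/dequantize pair this pass inserts around it would be duplicated.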
graphlib::Node *bias = graph->data_operands(conv2d)[2];
+    graphlib::OpNode *bias_op = dynamic_cast<graphlib::OpNode *>(bias);
+
+    if ((not act) or (not weight))
+        return false;
+    // if bias is nullptr then it's just a parameter/constant node, which is fine.
+    bool can_insert = (act->op_type().op == "dequantize") and (weight->op_type().op == "dequantize") and ((not bias_op) or bias_op->op_type().op != "dequantize");
+    // bias must be single dim as well
+    can_insert = can_insert and bias->shape().size() == 1;
+    return can_insert;
+}
+
+bool can_insert_on_matmul_bias(graphlib::Graph *graph, graphlib::OpNode *add) {
+    if (add->op_type().op != "add")
+        return false;
+
+    // One of these must be a dequantize; the other will then be the bias
+    graphlib::OpNode *lhs = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[0]);
+    graphlib::OpNode *rhs = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[1]);
+
+    graphlib::OpNode *deq;
+    graphlib::Node *bias;
+
+    if (lhs and lhs->op_type().op == "dequantize") {
+        deq = lhs;
+        bias = graph->data_operands(add)[1];
+    }
+    else if (rhs and rhs->op_type().op == "dequantize") {
+        deq = rhs;
+        bias = graph->data_operands(add)[0];
+    }
+    else {
+        // Neither input is dequantize
+        return false;
+    }
+
+
+    // For now, the way we know this is a bias-add is that the dequantize node's input has output_df Int32.
+    // This is because the quantized matmul above returns an Int32.
+    graphlib::Node *deq_input = graph->data_operands(deq)[0];
+    bool can_insert = deq_input->output_df() == tt::DataFormat::Int32;
+
+    // bias must be single dim as well
+    can_insert = can_insert and bias->shape().size() == 1;
+    return can_insert;
+}
+
+bool insert_qdq_on_matmul_bias(graphlib::Graph *graph, graphlib::OpNode *add) {
+    TT_ASSERT(can_insert_on_matmul_bias(graph, add), "Cannot insert qdq on add bias");
+
+    // One of these must be a dequantize; the other will then be the bias
+    graphlib::OpNode *lhs = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[0]);
+    graphlib::OpNode *rhs = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[1]);
+
+    graphlib::OpNode *deq;
+    graphlib::Node *bias;
+
+
+    // Due to the TT_ASSERT at the top of the function, we know one of lhs or rhs must be a dequantize
+    bool bias_is_rhs = false;
+    if (lhs and lhs->op_type().op == "dequantize") {
+        deq = lhs;
+        bias = graph->data_operands(add)[1];
+        bias_is_rhs = true;
+    }
+    else {
+        deq = rhs;
+        bias = graph->data_operands(add)[0];
+        bias_is_rhs = false;
+    }
+    int axis = std::get<int>(deq->op_attrs()[1]);
+    // Insert unsqueezes to match the rank of add
+    handle_change_rank(graph, add);
+    if (bias_is_rhs) {
+        bias = graph->data_operands(add)[1];
+    }
+    else {
+        bias = graph->data_operands(add)[0];
+    }
+
+
+    graphlib::Node *scale = graph->data_operands(deq)[1];
+    graphlib::Edge add_bias_edge = retrieve_between_edge(graph, bias, add);
+    std::vector quant_attrs{0.0f, axis, std::string("torch.int32")};
+    std::vector dequant_attrs{0.0f, axis};
+
+    std::string quantize_name = "bias_quantize_insert_" + std::to_string(add_bias_edge.edge_creation_id);
+    graphlib::OpNode *quantize = graph->add_node(graphlib::create_node(quantize_name, "quantize"),
+                                            graph->get_subgraph_id_for_node(add->id()));
+
+    quantize->set_shape(bias->shape()); // Use bias shape because we place quantize before tms
+    quantize->overwrite_op_attrs(quant_attrs);
+    quantize->set_output_df(tt::DataFormat::Int32);
+
+    std::string dequantize_name = "bias_dequantize_insert_" + std::to_string(add_bias_edge.edge_creation_id);
+    graphlib::OpNode *dequantize = graph->add_node(graphlib::create_node(dequantize_name, "dequantize"),
+                                            graph->get_subgraph_id_for_node(add->id()));
+
+    dequantize->overwrite_op_attrs(dequant_attrs);
+    dequantize->set_output_df(tt::DataFormat::Float32);
+
+    auto edge_tms1 = graph->get_edge_attributes(add_bias_edge)->get_tms();
+    auto [_, out_edge] = insert_node_on_edge(graph, add_bias_edge, quantize, true, true, 0U, true);
+
+    // Raise broadcast tms to op nodes so that the broadcasts can be consteval'ed into the bias.
+    // We do this because pre-placer may insert a matmul to perform a tile broadcast. But then
+    // both inputs would be int32, and we cannot perform integer matmuls with inputs that are
+    // not either int8 or Uint8.
+    std::vector<graphlib::OpType> tms = graph->get_edge_attributes(out_edge)->get_tms();
+    graph->get_edge_attributes(out_edge)->set_tms({});
+    auto current_shape = quantize->shape();
+    for (uint32_t i = 0; i < tms.size(); i++) {
+        auto tm = tms[i];
+        TT_ASSERT(tm.op == "broadcast", "TM must be broadcast");
+        std::string name = "quantized_bias_insertion_raised_" + tm.op + std::to_string(out_edge.edge_creation_id) + "_"+ std::to_string(i);
+        graphlib::OpNode *tm_op = graph->add_node(graphlib::create_node(name, tm.op), graph->get_subgraph_id_for_node(add->id()));
+        tm_op->overwrite_op_attrs(tm.attr);
+        current_shape[std::get<int>(tm.attr[0])] = std::get<int>(tm.attr[1]);
+        tm_op->set_shape(current_shape);
+        out_edge = insert_node_on_edge(graph, out_edge, tm_op).second;
+        tm_op->set_output_df(tt::DataFormat::Int32);
+    }
+
+    insert_node_on_edge(graph, out_edge, dequantize);
+    dequantize->set_shape(current_shape); // Use the post-tm shape because we place dequantize after tms
+
+
+    graph->add_edge(scale, quantize);
+    graph->add_edge(scale, dequantize);
+    return true;
+}
+
+
+bool insert_qdq_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) {
+    TT_ASSERT(can_insert_on_conv2d_bias(graph, conv2d), "Cannot insert qdq on conv2d bias");
+
+    // Both act and weight must have a dequant node as input and the bias cannot
+    graphlib::OpNode *deq_act = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[0]);
+    graphlib::OpNode *deq_weight = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[1]);
+    graphlib::Node *bias = graph->data_operands(conv2d)[2];
+
+    graphlib::Node *deq_act_scale = graph->data_operands(deq_act)[1];
+    graphlib::Node *deq_weight_scale = graph->data_operands(deq_weight)[1];
+
+    std::string scale_multiply_name = conv2d->name() + "_multiply_scales_" + deq_act_scale->name() + "_" + deq_weight_scale->name();
+    graphlib::OpNode *scale_multiply = graph->add_node(graphlib::create_node(scale_multiply_name, "multiply"),
+                                            graph->get_subgraph_id_for_node(conv2d->id()));
+
+    uint32_t max_scale_shape = std::max(deq_act_scale->shape()[0], deq_weight_scale->shape()[0]);
+    graphlib::Shape scale_multiply_shape = graphlib::Shape::create(std::vector{max_scale_shape});
+    scale_multiply->set_shape(scale_multiply_shape);
+
+    graph->add_edge(deq_act_scale, scale_multiply);
+    graph->add_edge(deq_weight_scale, scale_multiply);
+    scale_multiply->set_output_df_from_operands(graph);
+
+    // Potentially add broadcast on scale edge if one of the scales is not shaped [1]
+    if (deq_act_scale->shape()[0] != deq_weight_scale->shape()[0]) {
+        TT_ASSERT(deq_act_scale->shape()[0] == 1 or deq_weight_scale->shape()[0] == 1, "Cannot multiply differently shaped tensors if the dim of one of them is not 1");
+
+        if (deq_act_scale->shape()[0] > deq_weight_scale->shape()[0]) {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq_act_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+        else {
+
+bool insert_qdq_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) {
+    TT_ASSERT(can_insert_on_conv2d_bias(graph, conv2d), "Cannot insert qdq on conv2d bias");
+
+    // Both act and weight must have a dequant node as input, and the bias cannot
+    graphlib::OpNode *deq_act = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[0]);
+    graphlib::OpNode *deq_weight = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[1]);
+    graphlib::Node *bias = graph->data_operands(conv2d)[2];
+
+    graphlib::Node *deq_act_scale = graph->data_operands(deq_act)[1];
+    graphlib::Node *deq_weight_scale = graph->data_operands(deq_weight)[1];
+
+    std::string scale_multiply_name = conv2d->name() + "_multiply_scales_" + deq_act_scale->name() + "_" + deq_weight_scale->name();
+    graphlib::OpNode *scale_multiply = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(scale_multiply_name, "multiply"),
+                                                       graph->get_subgraph_id_for_node(conv2d->id()));
+
+    uint32_t max_scale_shape = std::max(deq_act_scale->shape()[0], deq_weight_scale->shape()[0]);
+    graphlib::Shape scale_multiply_shape = graphlib::Shape::create(std::vector<std::uint32_t>{max_scale_shape});
+    scale_multiply->set_shape(scale_multiply_shape);
+
+    graph->add_edge(deq_act_scale, scale_multiply);
+    graph->add_edge(deq_weight_scale, scale_multiply);
+    scale_multiply->set_output_df_from_operands(graph);
+
+    // Potentially add a broadcast on a scale edge if one of the scales is not shaped [1]
+    if (deq_act_scale->shape()[0] != deq_weight_scale->shape()[0]) {
+        TT_ASSERT(deq_act_scale->shape()[0] == 1 or deq_weight_scale->shape()[0] == 1, "Cannot multiply differently shaped tensors if the dim of one of them is not 1");
+
+        if (deq_act_scale->shape()[0] > deq_weight_scale->shape()[0]) {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq_act_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+        else {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq_weight_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+    }
+
+    std::vector<graphlib::OpType::Attr> quant_attrs{0.0f, (int)0, std::string("torch.int32")};
+    std::vector<graphlib::OpType::Attr> dequant_attrs{0.0f, (int)0};
+    graphlib::Edge conv_bias_edge = retrieve_between_edge(graph, bias, conv2d);
+    std::string quantize_name = "bias_quantize_insert_" + std::to_string(conv_bias_edge.edge_creation_id);
+    graphlib::OpNode *quantize = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(quantize_name, "quantize"),
+                                                 graph->get_subgraph_id_for_node(conv2d->id()));
+
+    quantize->set_shape(bias->shape()); // Use the bias shape because we place the quantize before tms
+    quantize->overwrite_op_attrs(quant_attrs);
+    quantize->set_output_df(tt::DataFormat::Int32);
+
+    std::string dequantize_name = "bias_dequantize_insert_" + std::to_string(conv_bias_edge.edge_creation_id);
+    graphlib::OpNode *dequantize = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(dequantize_name, "dequantize"),
+                                                   graph->get_subgraph_id_for_node(conv2d->id()));
+
+    dequantize->set_shape(bias->shape()); // Same shape as the bias: no tms are raised here
+    dequantize->overwrite_op_attrs(dequant_attrs);
+
+    auto [_, out_edge] = insert_node_on_edge(graph, conv_bias_edge, quantize);
+    insert_node_on_edge(graph, out_edge, dequantize);
+
+    graph->add_edge(scale_multiply, quantize);
+    graph->add_edge(scale_multiply, dequantize);
+    quantize->set_output_df_from_operands(graph);
+    dequantize->set_output_df_from_operands(graph);
+
+    return true;
+}
+
+const std::array<std::string, 2> quantizeable_ops{
+    "add",
+    "conv2d"
+};
+bool insert_qdq_on_biases(graphlib::Graph *graph) {
+
+    bool attempt_update = true;
+    bool graph_changed = false;
+    while (attempt_update) {
+        attempt_update = false;
+        for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) {
+            graphlib::OpNode *op_node = dynamic_cast<graphlib::OpNode *>(node);
+            if (not op_node)
+                continue;
+
+            if (std::find(quantizeable_ops.begin(), quantizeable_ops.end(), op_node->op_type().op) == quantizeable_ops.end())
+                continue;
+
+            if (can_insert_on_conv2d_bias(graph, op_node)) {
+                log_debug(LogGraphCompiler, "Inserting qdq pair on conv2d {}", op_node->name());
+                insert_qdq_on_conv2d_bias(graph, op_node);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            }
+            else if (can_insert_on_matmul_bias(graph, op_node)) {
+                log_debug(LogGraphCompiler, "Inserting qdq pair on add {}", op_node->name());
+                insert_qdq_on_matmul_bias(graph, op_node);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            }
+        }
+    }
+
+    return graph_changed;
+}
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/insert_qdq_on_biases.hpp b/pybuda/csrc/passes/insert_qdq_on_biases.hpp
new file mode 100644
index 00000000..6172e35b
--- /dev/null
+++ b/pybuda/csrc/passes/insert_qdq_on_biases.hpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace tt::graphlib
+{
+class Graph;
+}
+
+namespace tt::passes
+{
+bool insert_qdq_on_biases(graphlib::Graph *graph);
+}
\ No newline at end of file
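As a sanity check on the combined bias scale, a small PyTorch sketch (illustrative only, assuming zero zero-points and per-tensor scales): quantizing the bias with act_scale * weight_scale lets it join the int32 accumulator of the quantized op, and a single dequantize afterwards recovers the float result up to bias rounding.

    import torch

    act_scale, weight_scale = 0.02, 0.5
    act_q = torch.randint(-128, 128, (4, 8), dtype=torch.int32)
    weight_q = torch.randint(-128, 128, (8, 3), dtype=torch.int32)
    bias = torch.randn(3)

    # The int32 accumulator is in units of act_scale * weight_scale, so the bias
    # must be quantized with that combined scale to be added in the integer domain.
    bias_scale = act_scale * weight_scale
    bias_q = torch.round(bias / bias_scale).to(torch.int32)

    out = bias_scale * (act_q @ weight_q + bias_q)   # all-integer add, one dequantize after
    ref = (act_scale * act_q.float()) @ (weight_scale * weight_q.float()) + bias
    assert torch.allclose(out, ref, atol=bias_scale)  # equal up to bias rounding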
"utils/logger.hpp" +#include "python_bindings_common.hpp" +#include "graph_lib/node.hpp" +#include "graph_lib/graph.hpp" +#include "passes/make_quantized_ops.hpp" +#include "reportify/reportify.hpp" +#include + +namespace tt::passes +{ + +bool is_quantizeable_matmul(graphlib::Graph *graph, graphlib::Node *matmul) { + + graphlib::OpNode *matmul_op = dynamic_cast(matmul); + if (not matmul_op) + return false; + + if (matmul_op->op_type().op != "matmul") + return false; + + // Both inputs must be dequantize nodes + for (graphlib::Node *operand : graph->data_operands(matmul)) { + graphlib::OpNode *operand_op = dynamic_cast(operand); + if (not operand_op) + return false; + + if (operand_op->op_type().op != "dequantize") + return false; + } + + return true; +} + +bool is_quantizeable_add(graphlib::Graph *graph, graphlib::Node *add) { + + graphlib::OpNode *add_op = dynamic_cast(add); + if (not add_op) + return false; + + if (add_op->op_type().op != "add") + return false; + + // Both inputs must be dequantize nodes + std::vector scales; + for (graphlib::Node *operand : graph->data_operands(add_op)) { + graphlib::OpNode *operand_op = dynamic_cast(operand); + if (not operand_op) + return false; + + if (operand_op->op_type().op != "dequantize") + return false; + + scales.push_back(graph->data_operands(operand_op)[1]); + } + + // Scales to dequant must be identical + return scales[0] == scales[1]; +} + +bool is_quantizeable_conv2d(graphlib::Graph *graph, graphlib::Node *conv2d) { + graphlib::OpNode *conv_op = dynamic_cast(conv2d); + if (not conv_op) + return false; + + if (conv_op->op_type().op != "conv2d") + return false; + + // All inputs must be dequantize nodes + for (graphlib::Node *operand : graph->data_operands(conv2d)) { + graphlib::OpNode *operand_op = dynamic_cast(operand); + if (not operand_op) + return false; + + if (operand_op->op_type().op != "dequantize") + return false; + } + + // The scale of the bias dequant must be equal to the product of the scales of the act and weight + graphlib::OpNode *deq_act = dynamic_cast(graph->data_operands(conv2d)[0]); + graphlib::OpNode *deq_weight = dynamic_cast(graph->data_operands(conv2d)[1]); + graphlib::OpNode *deq_bias = dynamic_cast(graph->data_operands(conv2d)[2]); + + graphlib::Node *deq_act_scale = graph->data_operands(deq_act)[1]; + graphlib::Node *deq_weight_scale = graph->data_operands(deq_weight)[1]; + graphlib::Node *deq_bias_scale = graph->data_operands(deq_bias)[1]; + graphlib::OpNode *deq_bias_scale_op = dynamic_cast(deq_bias_scale); + + if (not deq_bias_scale_op or deq_bias_scale_op->op_type().op != "multiply") + return false; + + std::vector bias_scale_multiply_operands = graph->data_operands(deq_bias_scale_op); + + bool bias_scale_valid = (bias_scale_multiply_operands[0] == deq_act_scale and bias_scale_multiply_operands[1] == deq_weight_scale) + or (bias_scale_multiply_operands[1] == deq_act_scale and bias_scale_multiply_operands[0] == deq_weight_scale); + + return bias_scale_valid; +} + +void make_quantized_matmul(graphlib::Graph *graph, graphlib::OpNode *matmul) { + TT_ASSERT(matmul, "Null OpNode pointer given."); + TT_ASSERT(matmul->op_type().op == "matmul", "OpNode is not matmul"); + TT_ASSERT(is_quantizeable_matmul(graph, matmul), "Matmul is not quantizeable."); + + graphlib::OpNode *deq0 = dynamic_cast(graph->data_operands(matmul)[0]); + graphlib::OpNode *deq1 = dynamic_cast(graph->data_operands(matmul)[1]); + + graphlib::Node *deq0_scale = graph->data_operands(deq0)[1]; + graphlib::Node *deq1_scale = 
+
+void make_quantized_matmul(graphlib::Graph *graph, graphlib::OpNode *matmul) {
+    TT_ASSERT(matmul, "Null OpNode pointer given.");
+    TT_ASSERT(matmul->op_type().op == "matmul", "OpNode is not matmul");
+    TT_ASSERT(is_quantizeable_matmul(graph, matmul), "Matmul is not quantizeable.");
+
+    graphlib::OpNode *deq0 = dynamic_cast<graphlib::OpNode *>(graph->data_operands(matmul)[0]);
+    graphlib::OpNode *deq1 = dynamic_cast<graphlib::OpNode *>(graph->data_operands(matmul)[1]);
+
+    graphlib::Node *deq0_scale = graph->data_operands(deq0)[1];
+    graphlib::Node *deq1_scale = graph->data_operands(deq1)[1];
+
+    // We convert the dequant axis to a negative index because the matmul
+    // shape size might be larger than the shape of deq1,
+    // i.e. deq1 - [32, 32], matmul - [1, 1, 32, 32]
+    int new_deq_axis = std::get<int>(deq1->op_attrs()[1]);
+    if (new_deq_axis >= 0)
+        new_deq_axis -= deq1->shape().size();
+
+    // Must multiply the scales of both inputs to create the new scale
+    std::string scale_multiply_name = matmul->name() + "_multiply_scales_" + deq0_scale->name() + "_" + deq1_scale->name();
+    graphlib::OpNode *scale_multiply = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(scale_multiply_name, "multiply"),
+                                                       graph->get_subgraph_id_for_node(matmul->id()));
+
+    uint32_t max_scale_shape = std::max(deq0_scale->shape()[0], deq1_scale->shape()[0]);
+    graphlib::Shape scale_multiply_shape = graphlib::Shape::create(std::vector<std::uint32_t>{max_scale_shape});
+    scale_multiply->set_shape(scale_multiply_shape);
+
+    graph->add_edge(deq0_scale, scale_multiply);
+    graph->add_edge(deq1_scale, scale_multiply);
+    scale_multiply->set_output_df_from_operands(graph);
+
+    // Potentially add a broadcast on a scale edge if one of the scales is not shaped [1]
+    if (deq0_scale->shape()[0] != deq1_scale->shape()[0]) {
+        TT_ASSERT(deq0_scale->shape()[0] == 1 or deq1_scale->shape()[0] == 1, "Cannot multiply differently shaped tensors if the dim of one of them is not 1");
+
+        if (deq0_scale->shape()[0] > deq1_scale->shape()[0]) {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq0_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+        else {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq1_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+    }
+
+    // Make the dequant axis positive again, this time using the matmul shape
+    // as that is the new input to the dequant.
+    if (new_deq_axis < 0)
+        new_deq_axis += matmul->shape().size();
+
+    // Add a dequantize node after the matmul on all consumer edges
+    std::vector<graphlib::OpType::Attr> dequant_attrs{0.0f, new_deq_axis};
+
+    for (graphlib::Edge consumer_edge : graph->user_data_edges(matmul)) {
+        std::string dequant_name = "dequantize_post_matmul_" + std::to_string(consumer_edge.edge_creation_id);
+        graphlib::OpNode *dequant = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(dequant_name, "dequantize"),
+                                                    graph->get_subgraph_id_for_node(matmul->id()));
+        dequant->overwrite_op_attrs(dequant_attrs);
+        dequant->set_shape(matmul->shape());
+        insert_node_on_edge(graph, consumer_edge, dequant);
+        graph->add_edge(scale_multiply, dequant);
+    }
+
+    // Remove the scale edges so that bypass_node works (it requires that the node has one operand)
+    graphlib::Edge old_deq0_scale_edge = retrieve_between_edge(graph, deq0_scale, deq0);
+    graphlib::Edge old_deq1_scale_edge = retrieve_between_edge(graph, deq1_scale, deq1);
+    graph->remove_edge(old_deq0_scale_edge);
+    graph->remove_edge(old_deq1_scale_edge);
+
+    bypass_node(graph, deq0, true);
+    bypass_node(graph, deq1, true);
+    matmul->set_output_df(DataFormat::Int32);
+}
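The negative/positive axis round-trip above is easiest to see on concrete shapes; a small sketch (shapes are illustrative):

    # deq1's output is rank 2, the matmul's output is rank 4: the channel axis keeps
    # its meaning only if it is re-expressed relative to the end of the shape.
    deq1_shape = (32, 32)
    matmul_shape = (1, 1, 32, 32)
    axis = 0                                  # per-channel axis on the rank-2 producer

    neg_axis = axis - len(deq1_shape)         # -2, rank-independent form
    new_axis = neg_axis + len(matmul_shape)   # 2, the same channel in the rank-4 shape
    assert matmul_shape[new_axis] == deq1_shape[axis]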
+
+void make_quantized_add(graphlib::Graph *graph, graphlib::OpNode *add) {
+    TT_ASSERT(add, "Null OpNode pointer given.");
+    TT_ASSERT(add->op_type().op == "add", "OpNode is not add");
+    TT_ASSERT(is_quantizeable_add(graph, add), "add is not quantizeable.");
+
+    graphlib::OpNode *deq0 = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[0]);
+    graphlib::OpNode *deq1 = dynamic_cast<graphlib::OpNode *>(graph->data_operands(add)[1]);
+
+    // We already know from is_quantizeable_add that both dequant nodes share the same scale
+    graphlib::Node *scale = graph->data_operands(deq0)[1];
+
+    int new_deq_axis = std::get<int>(deq1->op_attrs()[1]);
+    if (new_deq_axis >= 0)
+        new_deq_axis = new_deq_axis - deq1->shape().size() + add->shape().size();
+
+    std::vector<graphlib::OpType::Attr> dequant_attrs{0.0f, new_deq_axis};
+    for (graphlib::Edge consumer_edge : graph->user_data_edges(add)) {
+        std::string dequant_name = "dequantize_post_add_" + std::to_string(consumer_edge.edge_creation_id);
+        graphlib::OpNode *dequant = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(dequant_name, "dequantize"),
+                                                    graph->get_subgraph_id_for_node(add->id()));
+        dequant->overwrite_op_attrs(dequant_attrs);
+        dequant->set_shape(add->shape());
+        insert_node_on_edge(graph, consumer_edge, dequant);
+        graph->add_edge(scale, dequant);
+    }
+
+    // Remove the scale edges so that bypass_node works (it requires that the node has one operand)
+    graphlib::Node *deq0_scale = graph->data_operands(deq0)[1];
+    graphlib::Node *deq1_scale = graph->data_operands(deq1)[1];
+    graphlib::Edge old_deq0_scale_edge = retrieve_between_edge(graph, deq0_scale, deq0);
+    graphlib::Edge old_deq1_scale_edge = retrieve_between_edge(graph, deq1_scale, deq1);
+    graph->remove_edge(old_deq0_scale_edge);
+    graph->remove_edge(old_deq1_scale_edge);
+
+    bypass_node(graph, deq0, true);
+    bypass_node(graph, deq1, true);
+    add->set_output_df(DataFormat::Int32);
+}
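is_quantizeable_add requires a shared scale because integer addition only commutes with a single dequantize: s*A + s*B == s*(A + B), whereas s1*A + s2*B has no single output scale. A sketch (illustrative):

    import torch

    A = torch.randint(-128, 128, (4, 4), dtype=torch.int32)
    B = torch.randint(-128, 128, (4, 4), dtype=torch.int32)
    s = 0.125  # power-of-two scale keeps this exact in float32

    assert torch.equal(s * A.float() + s * B.float(), s * (A + B).float())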
+
+void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) {
+    TT_ASSERT(conv2d, "Null OpNode pointer given.");
+    TT_ASSERT(conv2d->op_type().op == "conv2d", "OpNode is not conv2d");
+    TT_ASSERT(is_quantizeable_conv2d(graph, conv2d), "conv2d is not quantizeable.");
+
+    graphlib::OpNode *deq_act = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[0]);
+    graphlib::OpNode *deq_weight = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[1]);
+    graphlib::OpNode *deq_bias = dynamic_cast<graphlib::OpNode *>(graph->data_operands(conv2d)[2]);
+
+    graphlib::Node *deq_act_scale = graph->data_operands(deq_act)[1];
+    graphlib::Node *deq_weight_scale = graph->data_operands(deq_weight)[1];
+    graphlib::Node *deq_bias_scale = graph->data_operands(deq_bias)[1];
+
+    // We convert the dequant axis to a negative index because the conv2d
+    // shape size might be larger than the shape of the weight dequant,
+    // i.e. weight - [32, 32], conv2d - [1, 1, 32, 32]
+    int new_deq_axis = std::get<int>(deq_weight->op_attrs()[1]);
+    if (new_deq_axis >= 0)
+        new_deq_axis -= deq_weight->shape().size();
+
+    // Must multiply the scales of both inputs to create the new scale
+    std::string scale_multiply_name = conv2d->name() + "multiply_scales_" + deq_act_scale->name() + "_" + deq_weight_scale->name();
+    graphlib::OpNode *scale_multiply = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(scale_multiply_name, "multiply"),
+                                                       graph->get_subgraph_id_for_node(conv2d->id()));
+
+    uint32_t max_scale_shape = std::max(deq_act_scale->shape()[0], deq_weight_scale->shape()[0]);
+    graphlib::Shape scale_multiply_shape = graphlib::Shape::create(std::vector<std::uint32_t>{max_scale_shape});
+    scale_multiply->set_shape(scale_multiply_shape);
+
+    graph->add_edge(deq_act_scale, scale_multiply);
+    graph->add_edge(deq_weight_scale, scale_multiply);
+    scale_multiply->set_output_df_from_operands(graph);
+
+    // Potentially add a broadcast on a scale edge if one of the scales is not shaped [1]
+    if (deq_act_scale->shape()[0] != deq_weight_scale->shape()[0]) {
+        TT_ASSERT(deq_act_scale->shape()[0] == 1 or deq_weight_scale->shape()[0] == 1, "Cannot multiply differently shaped tensors if the dim of one of them is not 1");
+
+        if (deq_act_scale->shape()[0] > deq_weight_scale->shape()[0]) {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq_act_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+        else {
+            graphlib::Edge edge = retrieve_between_edge(graph, deq_weight_scale, scale_multiply);
+            graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+        }
+    }
+
+    // Make the dequant axis positive again, this time using the conv2d shape
+    // as that is the new input to the dequant.
+    if (new_deq_axis < 0)
+        new_deq_axis += conv2d->shape().size();
+
+    // The dequant axis may be 0, since conv weights may have a w dim
+    if (new_deq_axis == 0)
+        new_deq_axis = 1;
+
+    // Add a dequantize node after the conv2d on all consumer edges
+    std::vector<graphlib::OpType::Attr> dequant_attrs{0.0f, new_deq_axis};
+
+    for (graphlib::Edge consumer_edge : graph->user_data_edges(conv2d)) {
+        std::string dequant_name = "dequantize_post_conv2d_" + std::to_string(consumer_edge.edge_creation_id);
+        graphlib::OpNode *dequant = graph->add_node(graphlib::create_node<graphlib::PyOpNode>(dequant_name, "dequantize"),
+                                                    graph->get_subgraph_id_for_node(conv2d->id()));
+        dequant->overwrite_op_attrs(dequant_attrs);
+        dequant->set_shape(conv2d->shape());
+        insert_node_on_edge(graph, consumer_edge, dequant);
+        graph->add_edge(scale_multiply, dequant);
+    }
+
+    // Remove the scale edges so that bypass_node works (it requires that the node has one operand)
+    graphlib::Edge old_deq_act_scale_edge = retrieve_between_edge(graph, deq_act_scale, deq_act);
+    graphlib::Edge old_deq_weight_scale_edge = retrieve_between_edge(graph, deq_weight_scale, deq_weight);
+    graphlib::Edge old_deq_bias_scale_edge = retrieve_between_edge(graph, deq_bias_scale, deq_bias);
+    graph->remove_edge(old_deq_act_scale_edge);
+    graph->remove_edge(old_deq_weight_scale_edge);
+    graph->remove_edge(old_deq_bias_scale_edge);
+
+    bypass_node(graph, deq_act, true);
+    bypass_node(graph, deq_weight, true);
+    bypass_node(graph, deq_bias, true);
+    conv2d->set_output_df(DataFormat::Int32);
+}
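The broadcast set on the shorter scale's edge mirrors ordinary elementwise broadcasting between a per-tensor scale of shape [1] and a per-channel scale of shape [C]. A sketch (illustrative shapes):

    import torch

    act_scale = torch.tensor([0.05])       # per-tensor, shape [1]
    weight_scale = torch.rand(64) * 0.1    # per-output-channel, shape [64]

    combined = act_scale * weight_scale    # the shape-[1] operand is broadcast
    assert combined.shape == (64,)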
+
+const std::array<std::string, 3> quantizeable_ops{
+    "matmul",
+    "conv2d",
+    "add"
+};
+bool make_quantized_ops(graphlib::Graph *graph) {
+    /*
+     * This pass converts the following pattern (it also works for conv2d):
+     *
+     *   dequantize   dequantize           ...     ...
+     *       |            |                 |       |
+     *        \          /      =====>       \     /
+     *         \        /                      \   /
+     *           matmul                   matmul (quantized)
+     *                                          |
+     *                                          |
+     *                                      dequantize
+     */
+
+    bool attempt_update = true;
+    bool graph_changed = false;
+    while (attempt_update) {
+        attempt_update = false;
+        for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) {
+            graphlib::OpNode *op_node = dynamic_cast<graphlib::OpNode *>(node);
+            if (not op_node)
+                continue;
+
+            if (std::find(quantizeable_ops.begin(), quantizeable_ops.end(), op_node->op_type().op) == quantizeable_ops.end())
+                continue;
+
+            if (is_quantizeable_matmul(graph, op_node)) {
+                log_debug(LogGraphCompiler, "Making quantized matmul {}", op_node->name());
+                make_quantized_matmul(graph, op_node);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            }
+            else if (is_quantizeable_conv2d(graph, op_node)) {
+                log_debug(LogGraphCompiler, "Making quantized conv2d {}", op_node->name());
+                make_quantized_conv2d(graph, op_node);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            } else if (is_quantizeable_add(graph, op_node)) {
+                log_debug(LogGraphCompiler, "Making quantized add {}", op_node->name());
+                make_quantized_add(graph, op_node);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            }
+        }
+    }
+
+    return graph_changed;
+}
+
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/make_quantized_ops.hpp b/pybuda/csrc/passes/make_quantized_ops.hpp
new file mode 100644
index 00000000..d2fa7123
--- /dev/null
+++ b/pybuda/csrc/passes/make_quantized_ops.hpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace tt::graphlib
+{
+class Graph;
+}
+
+namespace tt::passes
+{
+bool make_quantized_ops(graphlib::Graph *graph);
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/move_dequantize.cpp b/pybuda/csrc/passes/move_dequantize.cpp
new file mode 100644
index 00000000..cd4ef9ba
--- /dev/null
+++ b/pybuda/csrc/passes/move_dequantize.cpp
@@ -0,0 +1,356 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <pybind11/pybind11.h>
+
+#include "graph_lib/node_types.hpp"
+#include "graph_lib/utils.hpp"
+#include "utils/logger.hpp"
+#include "python_bindings_common.hpp"
+#include "graph_lib/node.hpp"
+#include "graph_lib/graph.hpp"
+#include "passes/move_dequantize.hpp"
+#include "reportify/reportify.hpp"
+#include <vector>
+using namespace std;
+namespace tt::passes
+{
+
+bool dequantize_can_commute_reshape(graphlib::Graph *graph, graphlib::Node *reshape) {
+    graphlib::OpNode *deq_node = dynamic_cast<graphlib::OpNode *>(graph->data_operands(reshape)[0]);
+    TT_ASSERT(deq_node->op_type().op == "dequantize", "Reshape operand is not dequantize!");
+
+    // If the reshape is equivalent to a squeeze/unsqueeze, we should be able to move the dequantize through
+    uint32_t before_ones_count = 0;
+    uint32_t after_ones_count = 0;
+    for (uint32_t dim_size : deq_node->shape().as_vector()) {
+        if (dim_size == 1)
+            before_ones_count++;
+    }
+    for (uint32_t dim_size : reshape->shape().as_vector()) {
+        if (dim_size == 1)
+            after_ones_count++;
+    }
+
+    int32_t difference = before_ones_count - after_ones_count;
+    return difference == 1 or difference == -1;
+}
+
+bool dequantize_can_commute_hslice(graphlib::Graph *graph, graphlib::Node *hslice) {
+    graphlib::OpNode *deq_node = dynamic_cast<graphlib::OpNode *>(graph->data_operands(hslice)[0]);
+    TT_ASSERT(deq_node->op_type().op == "dequantize", "HSlice operand is not dequantize!");
+
+    int deq_axis = std::get<int>(deq_node->op_attrs()[1]);
+    graphlib::OpNode *hslice_op = dynamic_cast<graphlib::OpNode *>(hslice);
+    TT_ASSERT(hslice_op, "Expecting an OpNode");
+    TT_ASSERT(hslice_op->op_type().op == "hslice", "Expecting an hslice op.");
+
+    // We can swap the dequant and hslice so long as the dequant axis is not the z-dim or c-dim
+    bool is_c_dim = deq_axis == (int)deq_node->shape().size()-1 or deq_axis == -1;
+    bool is_z_dim = deq_node->shape().size() >= 3 and (deq_axis == (int)deq_node->shape().size()-3 and deq_axis == -3);
+
+    // If the dequant shape has 3 dimensions and the dequant axis is the z-dim, the axis can remain
+    // on the first dimension (w-dim) as hslice will yield a 4 dimension result
+    bool can_commute = (not is_c_dim) and ((not is_z_dim) or deq_node->shape().size() == 3);
+
+    return can_commute;
+}
+
+bool dequantize_can_commute_hstack(graphlib::Graph *graph, graphlib::Node *hstack) {
+    graphlib::OpNode *deq_node = dynamic_cast<graphlib::OpNode *>(graph->data_operands(hstack)[0]);
+    TT_ASSERT(deq_node->op_type().op == "dequantize", "HStack operand is not dequantize!");
+
+    int deq_axis = std::get<int>(deq_node->op_attrs()[1]);
+    graphlib::OpNode *hstack_op = dynamic_cast<graphlib::OpNode *>(hstack);
+    TT_ASSERT(hstack_op, "Expecting an OpNode");
+    TT_ASSERT(hstack_op->op_type().op == "hstack", "Expecting an hstack op.");
+
+    // We can swap the dequant and hstack so long as the dequant axis is not the z-dim or c-dim
+    bool is_c_dim = deq_axis == (int)deq_node->shape().size()-1 or deq_axis == -1;
+    bool is_z_dim = deq_node->shape().size() >= 3 and (deq_axis == (int)deq_node->shape().size()-3 and deq_axis == -3);
+
+    return (not is_z_dim) and (not is_c_dim);
+}
+
+bool op_commutes_dequantize(graphlib::Graph *graph, graphlib::Node *node) {
+    /*
+    Defines which ops commute with a dequantize node, i.e. which ops can be done in int8 and produce the same output
+    */
+    graphlib::PyOpNode *op_node = node->as<graphlib::PyOpNode>();
+    bool can_commute = op_node->op_type().op == "relu";
+    can_commute = can_commute or (op_node->op_type().op == "reshape" and dequantize_can_commute_reshape(graph, op_node));
+    can_commute = can_commute or op_node->op_type().op == "transpose";
+    can_commute = can_commute or (op_node->op_type().op == "hslice" and dequantize_can_commute_hslice(graph, op_node));
+    can_commute = can_commute or (op_node->op_type().op == "hstack" and dequantize_can_commute_hstack(graph, op_node));
+    return can_commute;
+}
+
+tt::graphlib::Node * get_user(graphlib::Graph *graph, tt::graphlib::Node *node) {
+    /*
+    Gets the first user (consumer) of node.
+    In case the node has more consumers, returns only the first one.
+    */
+    auto users = graph->data_users(node);
+    TT_ASSERT(users.size() > 0, "Node has no outputs");
+    tt::graphlib::Node *user = users[0];
+
+    return user;
+}
+
+tt::graphlib::Edge get_edge_from_parent_to_opnode(graphlib::Graph *graph, tt::graphlib::Node *op_node, tt::graphlib::Node *parent_node) {
+    /*
+    Returns the op_node_parent -> op_node edge
+    */
+    std::vector<graphlib::Edge> edges = graph->operand_data_edges(op_node);
+    for (auto edge : edges) {
+        if (edge.producer_node_id == parent_node->id())
+            return edge;
+    }
+    return edges[0];
+}
+
+void insert_edge(graphlib::Graph *graph, tt::graphlib::NodeId input_node_id, tt::graphlib::PortId input_node_port_id, tt::graphlib::NodeId output_node_id, tt::graphlib::PortId output_node_port_id) {
+    tt::graphlib::Edge skip_deq_edge = tt::graphlib::Edge(
+        input_node_id,
+        input_node_port_id,
+        output_node_id,
+        output_node_port_id,
+        graphlib::EdgeType::kData
+    );
+    graph->add_edge(skip_deq_edge);
+}
+
+void move_dequant_through_hslice(graphlib::Graph *graph, graphlib::Node *deq_node, graphlib::Node *hslice) {
+    TT_ASSERT(dequantize_can_commute_hslice(graph, hslice), "Dequantize cannot commute through hslice");
+    graphlib::OpNode *deq_node_op = dynamic_cast<graphlib::OpNode *>(deq_node);
+    graphlib::OpNode *hslice_op = dynamic_cast<graphlib::OpNode *>(hslice);
+    graphlib::Node *scale = graph->data_operands(deq_node_op)[1];
+
+    std::vector<graphlib::OpType::Attr> deq_attrs = deq_node_op->op_attrs();
+    int orig_deq_axis = std::get<int>(deq_attrs[1]);
+
+    int new_deq_axis = orig_deq_axis;
+    // If the scale shape volume is 1, then there is no need to change the dequant axis
+    uint32_t scale_volume = scale->shape().volume();
+    if (scale_volume > 1) {
+        // If the scale volume is > 1, then the dequant axis must be the one that has the same size as the scale.
+        // Since the scale should only ever be a 1-D vector by this point in compilation, the volume is the size we are looking for.
+        for (uint32_t i = 0; i < hslice_op->shape().size(); i++) {
+            if (hslice_op->shape()[i] == scale_volume) {
+                new_deq_axis = i;
+                break;
+            }
+        }
+    }
+
+    deq_attrs[1] = new_deq_axis;
+    deq_node_op->overwrite_op_attrs(deq_attrs);
+    deq_node_op->set_shape(hslice_op->shape());
+
+    // Clone the hslice op and place it on the first operand of the dequantize
+    graphlib::Edge edge = retrieve_between_edge(graph, deq_node, hslice);
+    std::string name = hslice->name() + "_dequant_commute_clone" + std::to_string(edge.producer_node_id);
+    graphlib::Node *hslice_clone = graph->add_node(hslice->clone(name), graph->get_subgraph_id_for_node(edge.producer_node_id));
+    graphlib::Edge input_edge = graph->operand_data_edges(deq_node)[0]; // Values to dequantize are the first input
+    insert_node_on_edge(graph, input_edge, hslice_clone);
+
+    bypass_node(graph, hslice, true);
+}
+
+void move_dequant_through_hstack(graphlib::Graph *graph, graphlib::Node *deq_node, graphlib::Node *hstack) {
+    TT_ASSERT(dequantize_can_commute_hstack(graph, hstack), "Dequantize cannot commute through hstack");
+    graphlib::OpNode *deq_node_op = dynamic_cast<graphlib::OpNode *>(deq_node);
+    graphlib::OpNode *hstack_op = dynamic_cast<graphlib::OpNode *>(hstack);
+    graphlib::Node *scale = graph->data_operands(deq_node_op)[1];
+
+    std::vector<graphlib::OpType::Attr> deq_attrs = deq_node_op->op_attrs();
+    int orig_deq_axis = std::get<int>(deq_attrs[1]);
+
+    int new_deq_axis = orig_deq_axis;
+    // If the scale shape volume is 1, then there is no need to change the dequant axis
+    uint32_t scale_volume = scale->shape().volume();
+    for (int32_t i = hstack_op->shape().size()-1; i >= 0; i--) {
+        if (hstack_op->shape()[i] == scale_volume) {
+            new_deq_axis = i;
+            break;
+        }
+    }
+
+    deq_attrs[1] = new_deq_axis;
+    deq_node_op->overwrite_op_attrs(deq_attrs);
+    deq_node_op->set_shape(hstack_op->shape());
+
+    // Clone the hstack op and place it on the first operand of the dequantize
+    graphlib::Edge edge = retrieve_between_edge(graph, deq_node, hstack);
+    std::string name = hstack->name() + "_dequant_commute_clone" + std::to_string(edge.producer_node_id);
+    graphlib::Node *hstack_clone = graph->add_node(hstack->clone(name), graph->get_subgraph_id_for_node(edge.producer_node_id));
+    graphlib::Edge input_edge = graph->operand_data_edges(deq_node)[0]; // Values to dequantize are the first input
+    insert_node_on_edge(graph, input_edge, hstack_clone);
+
+    bypass_node(graph, hstack, true);
+}
+
+void move_dequant_through_reshape(graphlib::Graph *graph, graphlib::Node *deq_node, graphlib::Node *reshape) {
+    TT_ASSERT(dequantize_can_commute_reshape(graph, reshape), "Dequantize cannot commute through reshape");
+
+    int32_t min_shape_size = deq_node->shape().size() > reshape->shape().size() ? reshape->shape().size() : deq_node->shape().size();
+    graphlib::Edge edge = retrieve_between_edge(graph, deq_node, reshape);
+    bool is_squeeze = reshape->shape().size() == (uint32_t)min_shape_size;
+
+    // Find which dimension has been added/removed
+    int32_t changed_dim = 0;
+    bool found_changed_dim = false;
+    for (int32_t dim = -1; dim >= -min_shape_size; dim--) {
+        if (deq_node->shape()[dim] != reshape->shape()[dim]) {
+            found_changed_dim = true;
+            changed_dim = dim;
+            break;
+        }
+    }
+
+    if (not found_changed_dim) {
+        changed_dim -= 1; // The dim was added/removed at the very front
+    }
+
+    graphlib::OpNode *deq_node_op = dynamic_cast<graphlib::OpNode *>(deq_node);
+    std::vector<graphlib::OpType::Attr> op_attrs = deq_node_op->op_attrs();
+    int32_t deq_axis = std::get<int>(op_attrs[1]);
+
+    // Convert the dequant axis to positive
+    if (deq_axis < 0) {
+        deq_axis += deq_node->shape().size();
+    }
+
+    if (is_squeeze) {
+        if (changed_dim < deq_axis)
+            deq_axis -= 1;
+    }
+    else {
+        if (changed_dim <= deq_axis)
+            deq_axis += 1;
+    }
+    op_attrs[1] = deq_axis;
+
+    std::string name = reshape->name() + "_dequant_commute_clone" + std::to_string(edge.producer_node_id);
+    graphlib::Node *clone = graph->add_node(reshape->clone(name), graph->get_subgraph_id_for_node(edge.producer_node_id));
+    graphlib::OpNode *clone_op = dynamic_cast<graphlib::OpNode *>(clone);
+
+    graphlib::Edge input_edge = graph->operand_data_edges(deq_node)[0]; // Values to dequantize are the first input
+    insert_node_on_edge(graph, input_edge, clone);
+    clone_op->set_output_df_from_operands(graph);
+    bypass_node(graph, reshape, true);
+
+    deq_node_op->set_shape(clone_op->shape());
+    deq_node_op->overwrite_op_attrs(op_attrs);
+}
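A squeeze-like reshape only shifts the per-channel axis; dequantizing before or after the reshape gives the same tensor once the axis is adjusted. A sketch (illustrative):

    import torch

    x = torch.randint(-128, 128, (1, 3, 5), dtype=torch.int32)
    scale = torch.tensor([0.1, 0.2, 0.4])  # per-channel scale on axis 1

    dequant_then_reshape = (x.float() * scale.reshape(1, -1, 1)).reshape(3, 5)
    reshape_then_dequant = x.reshape(3, 5).float() * scale.reshape(-1, 1)  # axis 1 -> 0
    assert torch.equal(dequant_then_reshape, reshape_then_dequant)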
+
+void move_dequant_through_transpose(graphlib::Graph *graph, graphlib::Node *deq_node, graphlib::Node *transpose) {
+    graphlib::OpNode *deq_node_op = dynamic_cast<graphlib::OpNode *>(deq_node);
+    graphlib::Edge edge = retrieve_between_edge(graph, deq_node, transpose);
+
+    std::vector<graphlib::OpType::Attr> deq_attrs = deq_node_op->op_attrs();
+    int deq_axis = std::get<int>(deq_attrs[1]);
+    if (deq_axis < 0)
+        deq_axis += deq_node->shape().size();
+
+    graphlib::OpNode *transpose_op = dynamic_cast<graphlib::OpNode *>(transpose);
+    int dim0 = transpose_op->op_type().get_attr_as<int>("dim0");
+    if (dim0 < 0)
+        dim0 += transpose->shape().size();
+    int dim1 = transpose_op->op_type().get_attr_as<int>("dim1");
+    if (dim1 < 0)
+        dim1 += transpose->shape().size();
+
+    if (dim0 == deq_axis)
+        deq_axis = dim1;
+    else if (dim1 == deq_axis)
+        deq_axis = dim0;
+
+    deq_attrs[1] = deq_axis;
+
+    std::string name = transpose->name() + "_dequant_commute_clone" + std::to_string(edge.producer_node_id);
+    graphlib::Node *clone = graph->add_node(transpose->clone(name), graph->get_subgraph_id_for_node(edge.producer_node_id));
+    graphlib::OpNode *clone_op = dynamic_cast<graphlib::OpNode *>(clone);
+
+    graphlib::Edge input_edge = graph->operand_data_edges(deq_node)[0]; // Values to dequantize are the first input
+    insert_node_on_edge(graph, input_edge, clone);
+    clone_op->set_output_df_from_operands(graph);
+
+    // Bypass the transpose node and swap the dequantization axis accordingly
+    bypass_node(graph, transpose, true);
+    deq_node_op->set_shape(clone_op->shape());
+    deq_node_op->overwrite_op_attrs(deq_attrs);
+}
+
+void move_dequant_through_relu(graphlib::Graph *graph, graphlib::OpNode *deq_node, graphlib::OpNode *relu) {
+    // Clone the relu op onto the dequantize input and bypass the original relu
+    tt::graphlib::Edge deq_data_input_edge = graph->operand_data_edges(deq_node)[0];
+    std::string name = relu->name() + "_dequant_commute_clone" + std::to_string(deq_data_input_edge.producer_node_id);
+    graphlib::Node* relu_clone = graph->add_node(relu->clone(name), graph->get_subgraph_id_for_node(deq_data_input_edge.producer_node_id));
+    graphlib::OpNode *relu_clone_op = dynamic_cast<graphlib::OpNode *>(relu_clone);
+
+    insert_node_on_edge(graph, deq_data_input_edge, relu_clone);
+    relu_clone_op->set_output_df_from_operands(graph);
+    relu_clone_op->set_shape(deq_node->shape());
+    bypass_node(graph, relu, true);
+}
+
+void swap_dequant_and_child(graphlib::Graph *graph, graphlib::OpNode *deq_node, graphlib::OpNode *child) {
+    log_debug(LogGraphCompiler, "Swapping {} and {}", deq_node->name(), child->name());
+    if (child->op_type().op == "relu")
+        move_dequant_through_relu(graph, deq_node, child);
+    else if (child->op_type().op == "reshape")
+        move_dequant_through_reshape(graph, deq_node, child);
+    else if (child->op_type().op == "transpose")
+        move_dequant_through_transpose(graph, deq_node, child);
+    else if (child->op_type().op == "hslice")
+        move_dequant_through_hslice(graph, deq_node, child);
+    else if (child->op_type().op == "hstack")
+        move_dequant_through_hstack(graph, deq_node, child);
+}
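Likewise for transpose: commuting the dequantize only re-indexes its axis through the dim swap. A sketch (illustrative):

    import torch

    x = torch.randint(-128, 128, (2, 3, 5), dtype=torch.int32)
    scale = torch.tensor([0.1, 0.2, 0.4])  # per-channel scale on axis 1

    def dequantize(t, scale, axis):
        shape = [1] * t.dim()
        shape[axis] = -1
        return t.float() * scale.reshape(shape)

    a = dequantize(x, scale, axis=1).transpose(1, 2)
    b = dequantize(x.transpose(1, 2), scale, axis=2)  # the axis follows the swap
    assert torch.equal(a, b)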
+
+bool move_dequantize(graphlib::Graph *graph) {
+    /*
+    Moves the dequantize op lower in the graph.
+    All nodes that the dequantize moves past become quantized, meaning they get int32 as input.
+    Currently works for graphs with no forks between the dequantize's current and desired position.
+    */
+    bool attempt_update = true;
+    bool graph_changed = false;
+    while (attempt_update) {
+        attempt_update = false;
+        for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) {
+            graphlib::OpNode *deq_node = dynamic_cast<graphlib::OpNode *>(node);
+
+            if (not deq_node)
+                continue;
+
+            if (deq_node->op_type().op != "dequantize")
+                continue;
+
+            if (graph->data_users(deq_node).size() != 1)
+                continue;
+
+            log_debug(LogGraphCompiler, "Found dequant op: {}", deq_node->name());
+            graphlib::OpNode *child = dynamic_cast<graphlib::OpNode *>(graph->data_users(deq_node)[0]);
+
+            if (child and op_commutes_dequantize(graph, child)) {
+                log_debug("Commuting {} through {}", deq_node->name(), child->name());
+                swap_dequant_and_child(graph, deq_node, child);
+                attempt_update = true;
+                graph_changed = true;
+                break;
+            }
+        }
+    }
+
+    return graph_changed;
+}
+
+}
diff --git a/pybuda/csrc/passes/move_dequantize.hpp b/pybuda/csrc/passes/move_dequantize.hpp
new file mode 100644
index 00000000..84d021c9
--- /dev/null
+++ b/pybuda/csrc/passes/move_dequantize.hpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace tt::graphlib
+{
+class Graph;
+}
+
+namespace tt::passes
+{
+bool move_dequantize(graphlib::Graph *graph);
+}
diff --git a/pybuda/csrc/passes/move_requantize.cpp b/pybuda/csrc/passes/move_requantize.cpp
index 23eecee0..dc8c3337 100644
--- a/pybuda/csrc/passes/move_requantize.cpp
+++ b/pybuda/csrc/passes/move_requantize.cpp
@@ -110,7 +110,7 @@ void commute_through_requant(graphlib::Graph *graph, std::vector
+
+#include <pybind11/pybind11.h>
+
+#include "graph_lib/node_types.hpp"
+#include "graph_lib/utils.hpp"
+#include "utils/logger.hpp"
+#include "python_bindings_common.hpp"
+#include "graph_lib/node.hpp"
+#include "graph_lib/graph.hpp"
+#include "passes/dequant_quant_to_requant.hpp"
+#include "reportify/reportify.hpp"
+
+namespace tt::passes
+{
+
+void bypass_qdq_pair(graphlib::Graph *graph, graphlib::OpNode *quantize, graphlib::OpNode *dequantize) {
+    TT_ASSERT(quantize->op_type().op == "quantize" and dequantize->op_type().op == "dequantize", "Improper ops passed.");
+    TT_ASSERT(graph->data_users(dequantize).size() == 1, "Only support dequant with one child, quantize");
+
+    // Purge the graph of all nodes that solely feed the scales of the quantize or dequantize
+    auto purge_scale_graph = [graph](graphlib::Node *scale) {
+        std::vector<graphlib::Node *> nodes_to_check{scale};
+        std::vector<graphlib::Node *> nodes_to_remove;
+        while (nodes_to_check.size() > 0) {
+            graphlib::Node* to_check = nodes_to_check.back();
+            nodes_to_check.pop_back();
+
+            if (graph->data_users(to_check).size() > 1) {
+                continue;
+            } else {
+                for (graphlib::Node *operand : graph->data_operands(to_check))
+                    nodes_to_check.push_back(operand);
+                nodes_to_remove.push_back(to_check);
+            }
+        }
+        for (graphlib::Node *node : nodes_to_remove) {
+            graph->remove_node(node);
+        }
+    };
+
+    // Purge the scale of one before the other. This way, if both quant and dequant point to the same scale (directly or indirectly),
+    // the first call to purge_scale_graph will do nothing as the scale has multiple users. After the quantize
+    // is bypassed, when we call purge_scale_graph again the scale will be erased, as the edge that was once
+    // pointing to the quantize is gone (thanks to bypass_node).
+    purge_scale_graph(graph->data_operands(quantize)[1]);
+    bypass_node(graph, quantize, true);
+    purge_scale_graph(graph->data_operands(dequantize)[1]);
+    bypass_node(graph, dequantize, true);
+}
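The pair is removable because a quantize immediately followed by a dequantize with the same scale is the identity up to rounding and clamping. A sketch (illustrative, zero zero-point):

    import torch

    def quantize(x, scale):
        return torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)

    def dequantize(q, scale):
        return q.float() * scale

    x = torch.rand(4, 4)  # stays well inside the int8 range for this scale
    scale = 1 / 64
    assert torch.allclose(dequantize(quantize(x, scale), scale), x, atol=scale / 2)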
+
+bool remove_quant_dequant(graphlib::Graph *graph) {
+
+    bool attempt_update = true;
+    bool graph_changed = false;
+    while (attempt_update) {
+        attempt_update = false;
+        for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) {
+            graphlib::OpNode *op_node = dynamic_cast<graphlib::OpNode *>(node);
+            if (not op_node)
+                continue;
+
+            if (graph->data_users(op_node).size() != 1)
+                continue;
+
+            graphlib::OpNode *op_child = dynamic_cast<graphlib::OpNode *>(graph->data_users(op_node)[0]);
+            if (not op_child)
+                continue;
+
+            // The single quantize -> dequantize edge must not carry any tms
+            graphlib::Edge user_edge = graph->user_data_edges(op_node)[0];
+            if (graph->get_edge_attributes(user_edge)->get_tms().size() > 0)
+                continue;
+
+            // Must be a quantize followed by a dequantize
+            if (op_node->op_type().op != "quantize" or op_child->op_type().op != "dequantize")
+                continue;
+
+            // Quantize should be producing an int8
+            // if (std::get<std::string>(op_child->op_attrs()[4]) != std::string("torch.int8"))
+            //     continue;
+
+            bypass_qdq_pair(graph, op_node, op_child);
+            graph_changed = true;
+            attempt_update = true;
+            break;
+        }
+    }
+
+    return graph_changed;
+}
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/remove_quant_dequant.hpp b/pybuda/csrc/passes/remove_quant_dequant.hpp
new file mode 100644
index 00000000..7ee4a393
--- /dev/null
+++ b/pybuda/csrc/passes/remove_quant_dequant.hpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace tt::graphlib
+{
+class Graph;
+}
+
+namespace tt::passes
+{
+bool remove_quant_dequant(graphlib::Graph *graph);
+}
\ No newline at end of file
diff --git a/pybuda/pybuda/op/eval/buda/matmul.py b/pybuda/pybuda/op/eval/buda/matmul.py
index 490a2424..fe7ff180 100644
--- a/pybuda/pybuda/op/eval/buda/matmul.py
+++ b/pybuda/pybuda/op/eval/buda/matmul.py
@@ -354,6 +354,11 @@ def shape(type, attr, ops, tile_height, tile_width):
         print(ops)
         assert False, f"If inner dimension is not the same for matmul, one of operands must have it be {TILE_DIM}."
+ if len(ops) == 3: + for i in range(len(output_shape)): + if output_shape[i] != ops[2][i]: + broadcast.append((2, i, output_shape[i])) + if accumulate: output_shape[-3] = 1 diff --git a/pybuda/pybuda/op/eval/buda/quantize.py b/pybuda/pybuda/op/eval/buda/quantize.py index 68f823fe..94adf5e3 100644 --- a/pybuda/pybuda/op/eval/buda/quantize.py +++ b/pybuda/pybuda/op/eval/buda/quantize.py @@ -61,12 +61,12 @@ def shape(type, attr, ops, tile_height, tile_width): op1 = [1] + op1 for dim in range(1, len(ops[0])): if ops[0][dim] != op1[dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) + broadcast.append((1, dim, ops[0][dim])) if type == "requantization": for dim in range(1, len(ops[0])): if ops[0][dim] != ops[1][dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) + broadcast.append((1, dim, ops[0][dim])) if type == "dequantization": op1 = list(ops[1]) @@ -74,7 +74,7 @@ def shape(type, attr, ops, tile_height, tile_width): op1 = [1] + op1 for dim in range(1, len(ops[0])): if ops[0][dim] != op1[dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) + broadcast.append((1, dim, ops[0][dim])) return ops[0], broadcast diff --git a/pybuda/pybuda/op/eval/buda/tm.py b/pybuda/pybuda/op/eval/buda/tm.py index 9c5a4cc8..90961e68 100644 --- a/pybuda/pybuda/op/eval/buda/tm.py +++ b/pybuda/pybuda/op/eval/buda/tm.py @@ -138,13 +138,13 @@ def eval(type, attr, ops): if type == "broadcast": - assert len(attr) <= 3, "Broadcast should have two attributes - dim and size" + assert len(attr) <= 3, "Broadcast should have two attributes - dim and factor" dim = attr[0] factor = attr[1] assert dim > 0, "Don't support broadcasting on w" if t_ops[0].is_sparse: - return bcast_sparse_picker_matrix(t_ops[0], dim, size) + return bcast_sparse_picker_matrix(t_ops[0], dim, factor) sizes = [1] * len(t_ops[0].shape) sizes[dim] = factor diff --git a/pybuda/pybuda/op/eval/pybuda/matmul.py b/pybuda/pybuda/op/eval/pybuda/matmul.py index 7c3be5b6..f3886075 100644 --- a/pybuda/pybuda/op/eval/pybuda/matmul.py +++ b/pybuda/pybuda/op/eval/pybuda/matmul.py @@ -26,6 +26,7 @@ def eval(type, attr, ops): t_ops, original_type = cast_for_cpu_eval(t_ops, type) if type == "matmul": + # import pdb; pdb.set_trace() result = torch.matmul(t_ops[0], t_ops[1]) result = result.to(original_type) if len(t_ops) > 2: diff --git a/pybuda/pybuda/op/eval/pybuda/quantize.py b/pybuda/pybuda/op/eval/pybuda/quantize.py index 7ad663fa..f92a205b 100644 --- a/pybuda/pybuda/op/eval/pybuda/quantize.py +++ b/pybuda/pybuda/op/eval/pybuda/quantize.py @@ -34,45 +34,54 @@ "torch.float32": 3.4028234663852886e+38, } + +def reshape_to_match_input(parameter, input_shape, axis): + if axis < 0: + axis = len(input_shape) + axis + + if len(parameter.shape) == 1: + left_ndim = axis + right_ndim = len(input_shape) - axis - 1 + target_shape = [1] * left_ndim + list(parameter.shape) + [1] * right_ndim + if target_shape[axis] != input_shape[axis]: + assert target_shape[axis] == 1 + parameter = torch.broadcast_to(parameter, target_shape) + + parameter = torch.reshape(parameter, target_shape) + + return parameter + + def eval(type, attr, ops): if type == "quantize": zero_point, axis, out_dtype = attr input_float = ops[0].float() scale = ops[1].float() + scale = reshape_to_match_input(scale, input_float.shape, axis) - output_float = torch.clamp( + output_int = torch.clamp( torch.round(input_float / scale) + zero_point, STRING_TO_LOWER_LIMIT[out_dtype], STRING_TO_UPPER_LIMIT[out_dtype],) - return output_float.to(STRING_TO_TORCH_DTYPE[out_dtype]) + return 
output_int.to(STRING_TO_TORCH_DTYPE[out_dtype]) elif type == "buda_quantize": zero_point, axis, out_dtype = attr input_float = ops[0].float() scale = ops[1].float() - output_float = torch.clamp( + output_int = torch.clamp( torch.round(input_float * scale) + zero_point, STRING_TO_LOWER_LIMIT[out_dtype], STRING_TO_UPPER_LIMIT[out_dtype],) - return output_float.to(STRING_TO_TORCH_DTYPE[out_dtype]) + + return output_int.to(STRING_TO_TORCH_DTYPE[out_dtype]) elif type == "dequantize": zero_point, axis = attr input_int8 = ops[0].float() scale = ops[1].float() - - if axis < 0: - axis = len(input_int8.shape) + axis - left_ndim = axis - right_ndim = len(input_int8.shape) - axis - 1 - if len(scale.shape) == 1: - target_shape = [1] * left_ndim + list(scale.shape) + [1] * right_ndim - - if target_shape[axis] != input_int8.shape[axis]: - assert target_shape[axis] == 1 - scale = torch.broadcast_to(scale, target_shape) - scale = torch.reshape(scale, target_shape) - + scale = reshape_to_match_input(scale, input_int8.shape, axis) + output_float = (input_int8 - zero_point) * scale return output_float @@ -89,46 +98,36 @@ def eval(type, attr, ops): inp_scale, out_scale, = ops[1], ops[2] output_scale = inp_scale / out_scale - if axis < 0: - axis = len(input_int32.shape) + axis - left_ndim = axis - right_ndim = len(input_int32.shape) - axis - 1 - if len(output_scale.shape) == 1: - target_shape = [1] * left_ndim + list(output_scale.shape) + [1] * right_ndim - - if target_shape[axis] != input_int32.shape[axis]: - assert target_shape[axis] == 1 - output_scale = torch.broadcast_to(output_scale, target_shape) - output_scale = torch.reshape(output_scale, target_shape) - + output_scale = reshape_to_match_input(output_scale, input_int32.shape, axis) assert inp_zp == 0, "Only support input zero point of 0" - output_float = torch.round(output_scale * (input_int32 - inp_zp) + out_zp) - output_float = torch.clamp( - output_float, + output_int = torch.round(output_scale * (input_int32 - inp_zp) + out_zp) + output_int = torch.clamp( + output_int, STRING_TO_LOWER_LIMIT[out_dtype], STRING_TO_UPPER_LIMIT[out_dtype],) - return output_float.to(STRING_TO_TORCH_DTYPE[out_dtype]) + return output_int.to(STRING_TO_TORCH_DTYPE[out_dtype]) elif type == "buda_requantize": zp, axis, rounding, out_dtype = attr input_int32 = ops[0].float() scale = ops[1].float() - output_float = torch.round(input_int32 * scale + zp) - output_float = torch.clamp( - output_float, + output_int = torch.round(input_int32 * scale + zp) + output_int = torch.clamp( + output_int, STRING_TO_LOWER_LIMIT[out_dtype], STRING_TO_UPPER_LIMIT[out_dtype],) - return output_float.to(STRING_TO_TORCH_DTYPE[out_dtype]) + return output_int.to(STRING_TO_TORCH_DTYPE[out_dtype]) def shape(type, attr, ops): broadcast = [] op0 = ops[0] op1 = ops[1] - if type == "quantize" or type == "buda_quantize": + # NOTE: Do not want to insert broadcasts on the scales of non 'buda_' quantize ops + if type == "buda_quantize": axis = attr[1] if axis < 0: axis = len(ops[0]) + axis @@ -142,13 +141,18 @@ def shape(type, attr, ops): assert len(op1) == len(op0), "Scale and input must have same dimension" for dim in range(1, len(op0)): if op0[dim] != op1[dim]: - broadcast.append((1, dim - len(op0), op0[dim])) + broadcast.append((1, dim, op0[dim])) + elif op0[dim] == 1: # We broadcast even if dims are both one in order to use unsqueeze from broadcast function + broadcast.append((1, dim, 1)) if type == "buda_requantize" or type == "buda_dequantize": - for dim in range(1, len(ops[0])): - if ops[0][dim] != 
ops[1][dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) - return ops[0], broadcast + for dim in range(1, len(op0)): + if op0[dim] != op1[dim]: + broadcast.append((1, dim, op0[dim])) + elif op0[dim] == 1: # We broadcast even if dims are both one in order to use unsqueeze from broadcast function + broadcast.append((1, dim, 1)) + + return ops[0], [] def lower(type, attr, lc, ops, outputs): @@ -170,8 +174,31 @@ def decompose(type, attr, dc, inputs): zero_point, axis, out_dtype = attr torch_dtype = STRING_TO_TORCH_DTYPE[out_dtype] buda_dtype = pytorch_dtype_to_buda_dataformat(torch_dtype) + act = inputs[0] scale = inputs[1] scale = dc.op(Reciprocal.create(), [scale], output_df=scale.output_df) + # Need to unsqueeze shape so dim is on the proper axis + if axis < 0: + axis = len(act.shape) + axis + left_ndim = axis + right_ndim = len(act.shape) - axis - 1 + + scale_shape = scale.shape.as_list() + if len(scale_shape) == 1: + # Match ndim with actiavtion + for i in range(0, left_ndim): + scale = dc.op("unsqueeze", [scale], attrs=(0, len(scale_shape)), output_df=scale.output_df) + scale_shape = [1] + scale_shape + for i in range(0, right_ndim): + scale = dc.op("unsqueeze", [scale], attrs=(len(scale_shape), len(scale_shape)), output_df=scale.output_df) + scale_shape = scale_shape + [1] + + for i in range(len(scale_shape)): + if scale_shape[i] != act.shape[i]: + assert scale_shape[i] == 1 and act.shape[i] > 1, f"Cannot broadcast ({scale_shape[i]}) to ({act.shape[i]})" + scale = dc.op("broadcast", [scale], attrs=(i-len(scale_shape), act.shape[i]), output_df=scale.output_df) + scale_shape = list(scale.shape) + out = dc.op("buda_quantize", [inputs[0], scale], attrs=attr, output_df=buda_dtype) dc.fuse(out) return @@ -205,10 +232,17 @@ def decompose(type, attr, dc, inputs): out_scale_shape = out_scale_shape + [1] - if out_scale_shape[axis] != act.shape[axis]: - assert out_scale_shape[axis] == 1 - out_scale = dc.op("broadcast", [out_scale], attrs=(axis - len(out_scale_shape), act.shape[axis]),output_df=out_scale.output_df) - out_scale_shape[axis] = act.shape[axis] + for i in range(len(out_scale_shape)): + if out_scale_shape[i] != act.shape[i]: + assert out_scale_shape[i] == 1 and act.shape[i] > 1, f"Cannot broadcast ({out_scale_shape[i]}) to ({act.shape[i]})" + out_scale = dc.op("broadcast", [out_scale], attrs=(i-len(out_scale_shape), act.shape[i]), output_df=out_scale.output_df) + out_scale_shape = list(out_scale.shape) + + if inp_scale_shape[i] != act.shape[i]: + assert inp_scale_shape[i] == 1 and act.shape[i] > 1, f"Cannot broadcast ({inp_scale_shape[i]}) to ({act.shape[i]})" + inp_scale = dc.op("broadcast", [inp_scale], attrs=(i-len(inp_scale_shape), act.shape[i]), output_df=inp_scale.output_df) + inp_scale_shape = list(inp_scale.shape) + recip_out_scale = dc.op(Reciprocal.create(), [out_scale],output_df=out_scale.output_df,) new_scale = dc.op("multiply", [inp_scale, recip_out_scale],output_df=out_scale.output_df,) @@ -238,6 +272,11 @@ def decompose(type, attr, dc, inputs): scale = dc.op("unsqueeze", [scale], attrs=(len(scale_shape), len(scale_shape)), output_df=scale.output_df) scale_shape = scale_shape + [1] + for i in range(len(scale_shape)): + if scale_shape[i] != act.shape[i]: + assert scale_shape[i] == 1 and act.shape[i] > 1, f"Cannot broadcast ({scale_shape[i]}) to ({act.shape[i]})" + scale = dc.op("broadcast", [scale], attrs=(i-len(scale_shape), act.shape[i]), output_df=scale.output_df) + scale_shape = list(scale.shape) out = dc.op("buda_dequantize", [act, scale], attrs=attr,) 
dc.fuse(out) diff --git a/pybuda/pybuda/op/eval/pybuda/tm.py b/pybuda/pybuda/op/eval/pybuda/tm.py index 5aa245be..832f07a7 100644 --- a/pybuda/pybuda/op/eval/pybuda/tm.py +++ b/pybuda/pybuda/op/eval/pybuda/tm.py @@ -183,8 +183,14 @@ def eval(type, attr, ops): tensor = t_ops[0] dim = attr[0] size = attr[1] - while len(tensor.shape) <= ((-dim - 1) if dim < 0 else dim): - tensor = tensor.unsqueeze(0) + + if dim < 0: + while len(tensor.shape) <= ((-dim - 1)): + tensor = tensor.unsqueeze(0) + else: + while len(tensor.shape) <= dim: + tensor = tensor.unsqueeze(-1) + target_shape = list(tensor.shape) assert dim < len( target_shape diff --git a/pybuda/pybuda/python_codegen.py b/pybuda/pybuda/python_codegen.py index 565813a0..5229c456 100644 --- a/pybuda/pybuda/python_codegen.py +++ b/pybuda/pybuda/python_codegen.py @@ -89,7 +89,7 @@ def __init__(self, module_name, open_file=True): self.file = open(os.path.join(self.module_directory, self.filename), "w") self.indent = 0 self.module_name = module_name - self.class_name = module_name.title().replace("_", "") + self.class_name = module_name.title().replace("_", "").replace("-", "") def wl(self, text): indent = self.indent * " " diff --git a/pybuda/pybuda/tvm_to_python.py b/pybuda/pybuda/tvm_to_python.py index e686013d..6c3244d0 100644 --- a/pybuda/pybuda/tvm_to_python.py +++ b/pybuda/pybuda/tvm_to_python.py @@ -1911,18 +1911,39 @@ def make_parser_friendly_name(node, node_type): assert zp_node['nid'] in constants zp_value = json_graph["params"][zp_node_name] del constants[zp_node["nid"]] - args.append(("zero_point", f"{float(zp_value.item())}")) + if zp_value.size == 1: + args.append(("zero_point", f"{float(zp_value.item())}")) + else: + args.append(("zero_point", f"{float(zp_value[0].item())}")) node["attrs"]["num_inputs"] = '2' if node["name"] == "qnn.dequantize": assert int(node["attrs"]["num_inputs"]) == 3 zp_node = graph["nodes"][node["inputs"][2][0]] - zp_node_name = zp_node['name'] + + # In case tvm added an op (such as cast) between zp and dequantize + if zp_node['op'] != 'constant': + if 'inputs' in zp_node: + zp_node_input = graph['nodes'][zp_node['inputs'][0][0]] + if zp_node_input['op'] == 'constant': + zp_node = zp_node_input + zp_value = torch.tensor([0]) + if 'users' in zp_node: + users = zp_node['users'] + for user in users: + if user in ops: + del ops[user] + else: + zp_node_name = zp_node['name'] + zp_value = json_graph["params"][zp_node_name] + assert zp_node['nid'] in constants - zp_value = json_graph["params"][zp_node_name] del constants[zp_node["nid"]] - args.append(("zero_point", f"{zp_value.item()}")) + if zp_value.size == 1: + args.append(("zero_point", f"{zp_value.item()}")) + else: + args.append(("zero_point", f"{float(zp_value[0].item())}")) node["attrs"]["num_inputs"] = '2' if node["name"] == "qnn.requantize": @@ -1937,7 +1958,10 @@ def make_parser_friendly_name(node, node_type): out_zp_node_name = out_zp_node['name'] assert out_zp_node['nid'] in constants out_zp_value = json_graph["params"][out_zp_node_name] - args.append(("output_zero_point", f"{out_zp_value.item()}")) + if zp_value.size == 1: + args.append(("output_zero_point", f"{out_zp_value.item()}")) + else: + args.append(("output_zero_point", f"{float(out_zp_value[0].item())}")) node["inputs"] = [node["inputs"][0], node["inputs"][1], node["inputs"][3]] del constants[inp_zp_node["nid"]] diff --git a/pybuda/test/quantized/test_onnx_qdq_commute.py b/pybuda/test/quantized/test_onnx_qdq_commute.py new file mode 100644 index 00000000..3dd7ccb0 --- /dev/null +++ 
b/pybuda/test/quantized/test_onnx_qdq_commute.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os + +import onnx +import onnxruntime as ort +import pytest +import torch +import numpy as np + +from pybuda import ( + BackendType, + VerifyConfig, +) +import pybuda +from pybuda.config import _get_global_compiler_config +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind + + +def test_int8_simple_conv(test_device): + + if test_device.arch != pybuda.BackendDevice.Wormhole_B0: + pytest.skip('Currently works only on Wormhole_B0') + + # Load ONNX model + load_path = "third_party/confidential_customer_models/quantized/simple_conv.onnx" + if not os.path.exists(load_path): + raise RuntimeError("Model not found") + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule("int8_simple_conv", model, load_path) + + # Define inputs + input_shape = (1, 3, 32, 32) + input_tensor = torch.rand(input_shape) + + # Configurations + compiler_cfg = _get_global_compiler_config() + #compiler_cfg.enable_auto_fusing = False + compiler_cfg.dont_fuse(['dequantize_18.dc.buda_dequantize.3', 'dequantize_8.dc.buda_dequantize.3']) + + # Sanity run + session = ort.InferenceSession(load_path) + input_name = session.get_inputs()[0].name + output_name = session.get_outputs()[0].name + output = session.run([output_name], {input_name: np.random.randn(*input_shape).astype(np.float32)}) + print("Sanity run output:", output) + + # Compile and verify + pcc = 0.97 if test_device.devtype == BackendType.Silicon else 0.99 + verify_module( + tt_model, + input_shapes=([input_shape]), + inputs=([input_tensor]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_all=True, + verify_pipeline_result_vs_framework=True, + verify_pybuda_codegen_vs_framework=True, + pcc=pcc + ), + ) + + +def test_int8_skip_conv(test_device): + + if test_device.arch != pybuda.BackendDevice.Wormhole_B0: + pytest.skip('Currently works only on Wormhole_B0') + + # Load ONNX model + load_path = "third_party/confidential_customer_models/quantized/skip_conv.onnx" + if not os.path.exists(load_path): + raise RuntimeError("Model not found") + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule("int8_skip_conv", model, load_path) + + # Define inputs + input_shape = (1, 3, 32, 32) + input_tensor = torch.rand(input_shape) + + # Configurations + compiler_cfg = _get_global_compiler_config() + #compiler_cfg.enable_auto_fusing = False + compiler_cfg.dont_fuse(['dequantize_18.dc.buda_dequantize.3', 'dequantize_8.dc.buda_dequantize.3']) + + # Sanity run + session = ort.InferenceSession(load_path) + input_name = session.get_inputs()[0].name + output_name = session.get_outputs()[0].name + output = session.run([output_name], {input_name: np.random.randn(*input_shape).astype(np.float32)}) + print("Sanity run output:", output) + + # Compile and verify + pcc = 0.97 if test_device.devtype == BackendType.Silicon else 0.99 + verify_module( + tt_model, + input_shapes=([input_shape]), + inputs=([input_tensor]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_all=True, + verify_pipeline_result_vs_framework=True, + verify_pybuda_codegen_vs_framework=True, + pcc=pcc + ), + ) + diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet.py 
b/pybuda/test/quantized/test_onnx_quantized_mobilenet.py index f80bb50e..e6b09015 100644 --- a/pybuda/test/quantized/test_onnx_quantized_mobilenet.py +++ b/pybuda/test/quantized/test_onnx_quantized_mobilenet.py @@ -17,6 +17,7 @@ BackendDevice, BackendType, ) +from pybuda._C import MathFidelity from pybuda.verify import verify_module from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config @@ -94,7 +95,7 @@ def test_onnx_quantized_mb_v2(test_device): compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" - compiler_cfg.place_on_new_epoch("conv2d_118.dc.reshape.0.dc.sparse_matmul.14.lc2") + compiler_cfg.place_on_new_epoch("conv2d_122.dc.reshape.0.dc.sparse_matmul.14.lc2") os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" @@ -121,4 +122,51 @@ def test_onnx_quantized_mb_v2(test_device): # verify_pybuda_codegen_vs_framework=True, # verify_all=True ), + ) + +def test_onnx_qdq_mobilenet(test_device): + # pytest.skip("Models not yet uploaded") + pytest.skip("WIP") + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/mobilenetv2_ptq_qdq.onnx" + + onnx_model = onnx.load(save_path) + # onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_mobilenet_v2", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + # verify_pybuda_codegen_vs_framework=True, + verify_all=True, + ), ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_resnet.py b/pybuda/test/quantized/test_onnx_quantized_resnet.py index bfbf16c0..e63b3e7a 100644 --- a/pybuda/test/quantized/test_onnx_quantized_resnet.py +++ b/pybuda/test/quantized/test_onnx_quantized_resnet.py @@ -16,6 +16,7 @@ DataFormat, BackendDevice, ) +from pybuda._C import MathFidelity from pybuda.verify import verify_module from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config @@ -100,3 +101,48 @@ def test_onnx_quantized_resnet(test_device): ), ) +def test_onnx_qdq_resnet(test_device): + pytest.skip("WIP") + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/resnet50_ptq_qdq.onnx" + + onnx_model = onnx.load(save_path) + # 
onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_ResNet50", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + # verify_pybuda_codegen_vs_framework=True, + verify_all=True, + ), + ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_vit.py b/pybuda/test/quantized/test_onnx_quantized_vit.py index 7601c77d..667cbe5f 100644 --- a/pybuda/test/quantized/test_onnx_quantized_vit.py +++ b/pybuda/test/quantized/test_onnx_quantized_vit.py @@ -12,11 +12,13 @@ BackendDevice, BackendType, ) +from pybuda._C import MathFidelity from pybuda.verify import verify_module from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config def test_int8_onnx_vit_calibrated(test_device): + pytest.skip("Not continuing support for QOperator models") # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 if test_device.arch == BackendDevice.Blackhole: pytest.skip("Blackhole does not support quantized models") @@ -69,3 +71,51 @@ def test_int8_onnx_vit_calibrated(test_device): pcc=pcc, ), ) + + +def test_onnx_qdq_vit(test_device): + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/vit_b_16_ptq_qdq.onnx" + + onnx_model = onnx.load(save_path) + # onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_vit", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 + compiler_cfg.retain_tvm_python_files = True + compiler_cfg.convert_framework_params_to_tvm = True + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + # verify_pybuda_codegen_vs_framework=True, + verify_all=True, + ), + ) \ No newline at end of file From 21a6a4c88b0d37cf7346f60e2845073f73c0aaad Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 19 Jul 2024 17:41:40 +0000 Subject: [PATCH 041/116] Add BOS models and also fix confidential_customer_model 
hash

(cherry picked from commit cf7b8ee1c2487039316f70a475811d70143ffc75)
---
 pybuda/test/quantized/test_onnx_quantized_vit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pybuda/test/quantized/test_onnx_quantized_vit.py b/pybuda/test/quantized/test_onnx_quantized_vit.py
index 667cbe5f..c43382cd 100644
--- a/pybuda/test/quantized/test_onnx_quantized_vit.py
+++ b/pybuda/test/quantized/test_onnx_quantized_vit.py
@@ -80,7 +80,7 @@ def test_onnx_qdq_vit(test_device):
     if test_device.arch == BackendDevice.Grayskull:
         pytest.skip("Grayskull does not support quantized models")

-    save_path = "third_party/confidential_customer_models/bos/vit_b_16_ptq_qdq.onnx"
+    save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/vit_b_16/vit_b_16_ptq_qdq.onnx"

     onnx_model = onnx.load(save_path)
     # onnx.checker.check_model(onnx_model)

From be36e651861296807b7bc69cdfb4dd0f1485822b Mon Sep 17 00:00:00 2001
From: Guangyu Feng
Date: Fri, 19 Jul 2024 15:47:39 +0000
Subject: [PATCH 042/116] fix galaxy sanity test

update cluster desc file with the new format

(cherry picked from commit 8dafb0ea90de28c3ab05eb23aa3760df36270218)
---
 pybuda/test/galaxy/one_shelf_eth_connections.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pybuda/test/galaxy/one_shelf_eth_connections.yaml b/pybuda/test/galaxy/one_shelf_eth_connections.yaml
index a6d3b84e..3c8e0374 100644
--- a/pybuda/test/galaxy/one_shelf_eth_connections.yaml
+++ b/pybuda/test/galaxy/one_shelf_eth_connections.yaml
@@ -245,7 +245,7 @@ ethernet_connections: [
 ]

 # harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc...
-harvesting: [
+harvesting: {
   0: {noc_translation: True, harvest_mask: 0},
   1: {noc_translation: True, harvest_mask: 0},
   2: {noc_translation: True, harvest_mask: 0},
@@ -278,7 +278,7 @@ harvesting: [
   29: {noc_translation: True, harvest_mask: 0},
   30: {noc_translation: True, harvest_mask: 0},
   31: {noc_translation: True, harvest_mask: 0},
-]
+}

 chips_with_mmio: [
   0: 0,
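The harvest_mask comment above fully specifies the encoding: it is a plain per-row bitmask. A small illustrative decoder (a hypothetical helper, not part of the repo; the tensix row count is left as a parameter since it differs per architecture):

```python
def harvested_rows(harvest_mask: int, num_rows: int = 10) -> list:
    # Bit i of harvest_mask set => tensix row i is harvested (unusable).
    return [row for row in range(num_rows) if (harvest_mask >> row) & 1]

assert harvested_rows(0) == []            # unharvested chip, as in the file above
assert harvested_rows(0b0101) == [0, 2]   # first and third tensix rows harvested
```

From b7b8874b9d9d83cd0b3bfed7583d4927a7619bcb Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Thu, 18 Jul 2024 12:02:38 +0000
Subject: [PATCH 043/116] Move binary models

Issue #2554 / #2767

(cherry picked from commit 14e0095481982afd065e0d3f1e78b4b3cfe37e2b)
---
 .../models/test_plan/__init__.py              |  11 -
 .../test_plan/model_op_src_const_eval_pass.py |  45 ----
 .../test_plan/model_op_src_from_another_op.py |  32 ---
 .../test_plan/model_op_src_from_dram_queue.py |  23 --
 .../model_op_src_from_dram_queue_prologued.py |  35 ---
 .../test_plan/model_op_src_from_host.py       |  23 --
 .../test_plan/model_op_src_from_tm_edge1.py   |  29 ---
 .../test_plan/model_op_src_from_tm_edge2.py   |  30 ---
 .../eltwise_binary/test_eltwise_binary.py     | 208 +++++++++++++++---
 9 files changed, 182 insertions(+), 254 deletions(-)
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py
 delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py
 delete mode 100644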
pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py delete mode 100644 pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py b/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py deleted file mode 100644 index 580a3bd3..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -from .model_op_src_from_another_op import BudaElementWiseBinaryTest -from .model_op_src_from_tm_edge1 import BudaElementWiseBinaryTest -from .model_op_src_from_tm_edge2 import BudaElementWiseBinaryTest -from .model_op_src_from_host import BudaElementWiseBinaryTest -from .model_op_src_from_dram_queue import BudaElementWiseBinaryTest -from .model_op_src_from_dram_queue_prologued import BudaElementWiseBinaryTest -from .model_op_src_const_eval_pass import BudaElementWiseBinaryTest diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py deleted file mode 100644 index 5007e2dc..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_const_eval_pass.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# when operand sorce from constants input - -import pybuda -import torch - -from pybuda import PyBudaModule -from test.operators.utils import ShapeUtils - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src const eval pass") - self.testname = "Element-wise binary operator " + opname + " test _ op src const eval pass" - self.operator = operator - self.opname = opname - self.shape = shape - - def my_rand(*shape, requires_grad=False): - return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - - self.constant_shape = ShapeUtils.reduce_microbatch_size(shape) - - self.add_constant("c1") - self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True)) - - self.add_constant("c2") - self.set_constant("c2", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True)) - - self.inputs = [ - pybuda.Tensor.create_from_torch(my_rand(*self.shape)) - ] - - def forward(self, x, y): - v1 = self.operator(self.opname + "0", self.get_constant("c1"), self.get_constant("c2")) - # v2 and v3 consume inputs - v2 = pybuda.op.Add("Add1", x, y) - v3 = pybuda.op.Add("Add2", v1, v2) - return v3 diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py deleted file mode 100644 index 6d33d453..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_another_op.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# when operand sorce is from another operator - -import pybuda - -from pybuda import PyBudaModule - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, 
opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from another op") - self.testname = "Element-wise binary operator " + opname + " test _ op src from another op" - self.operator = operator - self.opname = opname - self.shape = shape - - def forward(self, x, y): - # we use Add and Subtract operators to create two operands which are inputs for the binary operator - xx = pybuda.op.Add("Add0", x, y) - yy = pybuda.op.Subtract("Subtract0", x, y) - output = self.operator(self.opname + "1", xx, yy) - return output - - # TODO: check do we need this - # def values(self): - # return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py deleted file mode 100644 index c497ffd4..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# when operand sorce is from dram queue - -from pybuda import PyBudaModule - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue") - self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue" - self.operator = operator - self.opname = opname - self.shape = shape - - def forward(self, x, y): - output = self.operator(self.opname + "0", x, y) - return output \ No newline at end of file diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py deleted file mode 100644 index ad30217c..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_dram_queue_prologued.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# when operand sorce is from dram queue - -import pybuda -import torch - -from pybuda import PyBudaModule -from test.operators.utils import ShapeUtils - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue prologued") - self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue prologued" - self.operator = operator - self.opname = opname - self.shape = shape - - def my_rand(*shape, requires_grad=False): - return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - - self.shape_input = ShapeUtils.reduce_microbatch_size(shape) - - self.add_constant("c") - self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) - - def forward(self, x): - output = self.operator(self.opname + "0", self.get_constant("c"), x) - return output \ No newline at end of file diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py deleted file mode 100644 index 1de7810a..00000000 --- 
a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_host.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# when operand sorce is from host - -from pybuda import PyBudaModule - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from host") - self.testname = "Element-wise binary operator " + opname + " test _ op src from host" - self.operator = operator - self.opname = opname - self.shape = shape - - def forward(self, x, y): - output = self.operator(self.opname + "0", x, y) - return output diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py deleted file mode 100644 index 8f8ec076..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge1.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# and when operand sorce is from tm edge -# Combination: operator -> tm -> input - - -import pybuda - -from pybuda import PyBudaModule - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge1") - self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge1" - self.operator = operator - self.opname = opname - self.shape = shape - - def forward(self, x, y): - xx = pybuda.op.Add("Add0", x, y) - yy = pybuda.op.tm.Transpose("Transpose0", xx, -1, -2) - output = self.operator(self.opname + "1", yy, yy) - return output diff --git a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py b/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py deleted file mode 100644 index 5f9948df..00000000 --- a/pybuda/test/operators/eltwise_binary/models/test_plan/model_op_src_from_tm_edge2.py +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - - -# Model for testing element-wise binary operators -# and when operand sorce is from tm edge -# Combination: - tm -> input - - -import pybuda - -from pybuda import PyBudaModule - - -class BudaElementWiseBinaryTest(PyBudaModule): - - def __init__(self, operator, opname, shape): - super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge2") - self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge2" - self.operator = operator - self.opname = opname - self.shape = shape - - def forward(self, x, y): - # - xx = pybuda.op.tm.Transpose("Transpose0", x, -1, -2) - yy = pybuda.op.tm.Transpose("Transpose1", y, -1, -2) - output = self.operator(self.opname + "2", xx, yy) - return output diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index a95f4d78..c7311257 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -61,27 +61,174 @@ import pytest import numpy as np -from typing import List, Dict +from 
typing import List, Dict, Type from loguru import logger +import torch import pybuda import pybuda.op +from pybuda import PyBudaModule from pybuda.op_repo import TensorShape from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import ShapeUtils from test.conftest import TestDevice from pybuda import TTDevice, pybuda_compile, VerifyConfig, CompilerConfig -from . import models -from .models import test_plan +# from . import models +# from .models import test_plan -TEST_PLAN_MODELS_PATH = "./pybuda/test/operators/eltwise_binary/models/test_plan/" + +class ModelFromAnotherOp(PyBudaModule): + + model_name = "model_op_src_from_another_op" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from another op") + self.testname = "Element-wise binary operator " + opname + " test _ op src from another op" + self.operator = operator + self.opname = opname + self.shape = shape + + def forward(self, x, y): + # we use Add and Subtract operators to create two operands which are inputs for the binary operator + xx = pybuda.op.Add("Add0", x, y) + yy = pybuda.op.Subtract("Subtract0", x, y) + output = self.operator(self.opname + "1", xx, yy) + return output + + +class ModelFromHost(PyBudaModule): + + model_name = "model_op_src_from_host" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from host") + self.testname = "Element-wise binary operator " + opname + " test _ op src from host" + self.operator = operator + self.opname = opname + self.shape = shape + + def forward(self, x, y): + output = self.operator(self.opname + "0", x, y) + return output + + +class ModelFromDramQueue(PyBudaModule): + + model_name = "model_op_src_from_dram_queue" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue") + self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue" + self.operator = operator + self.opname = opname + self.shape = shape + + def forward(self, x, y): + output = self.operator(self.opname + "0", x, y) + return output + + +class ModelFromDramQueuePrologued(PyBudaModule): + + model_name = "model_op_src_from_dram_queue_prologued" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue prologued") + self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue prologued" + self.operator = operator + self.opname = opname + self.shape = shape + + def my_rand(*shape, requires_grad=False): + return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() + + self.shape_input = ShapeUtils.reduce_microbatch_size(shape) + + self.add_constant("c") + self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) + + def forward(self, x): + output = self.operator(self.opname + "0", self.get_constant("c"), x) + return output + + +class ModelConstEvalPass(PyBudaModule): + + model_name = "model_op_src_const_eval_pass" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src const eval pass") + self.testname = "Element-wise binary operator " + opname + " test _ op src const eval pass" + self.operator = operator + self.opname = opname + self.shape = shape + + def 
my_rand(*shape, requires_grad=False): + return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() + + self.constant_shape = ShapeUtils.reduce_microbatch_size(shape) + + self.add_constant("c1") + self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True)) + + self.add_constant("c2") + self.set_constant("c2", pybuda.Tensor.create_from_torch(my_rand(*self.constant_shape), constant=True)) + + self.inputs = [ + pybuda.Tensor.create_from_torch(my_rand(*self.shape)) + ] + + def forward(self, x, y): + v1 = self.operator(self.opname + "0", self.get_constant("c1"), self.get_constant("c2")) + # v2 and v3 consume inputs + v2 = pybuda.op.Add("Add1", x, y) + v3 = pybuda.op.Add("Add2", v1, v2) + return v3 + + +class ModelOpSrcFromTmEdge1(PyBudaModule): + + model_name = "model_op_src_from_tm_edge1" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge1") + self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge1" + self.operator = operator + self.opname = opname + self.shape = shape + + def forward(self, x, y): + xx = pybuda.op.Add("Add0", x, y) + yy = pybuda.op.tm.Transpose("Transpose0", xx, -1, -2) + output = self.operator(self.opname + "1", yy, yy) + return output + + +class ModelOpSrcFromTmEdge2(PyBudaModule): + + model_name = "model_op_src_from_tm_edge2" + + def __init__(self, operator, opname, shape): + super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge2") + self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge2" + self.operator = operator + self.opname = opname + self.shape = shape + + def forward(self, x, y): + # + xx = pybuda.op.tm.Transpose("Transpose0", x, -1, -2) + yy = pybuda.op.tm.Transpose("Transpose1", y, -1, -2) + output = self.operator(self.opname + "2", xx, yy) + return output def verify( test_device: TestDevice, - input_model: str, + model_type: Type[PyBudaModule], input_operator: str, input_shape: TensorShape, number_of_operands: int, @@ -92,8 +239,8 @@ def verify( ): '''Common verification function for all tests''' - architecture = f'test_plan.{input_model}.BudaElementWiseBinaryTest(operator=pybuda.op.{input_operator}, opname="{input_operator}", shape={input_shape})' - model = eval(architecture) + operator = getattr(pybuda.op, input_operator) + model = model_type(operator=operator, opname=input_operator, shape=input_shape) input_shapes = tuple([input_shape for _ in range(number_of_operands)]) logger.trace(f"***input_shapes: {input_shapes}") @@ -110,6 +257,17 @@ def verify( VerifyUtils.verify(model, test_device, input_shapes, input_params) +MODEL_TYPES = [ + ModelFromAnotherOp, + ModelFromHost, + ModelFromDramQueue, + # ModelFromDramQueuePrologued, + ModelConstEvalPass, + ModelOpSrcFromTmEdge1, + ModelOpSrcFromTmEdge2, +] + + def get_eltwise_binary_ops(): return [ "Add", #00 @@ -233,13 +391,11 @@ def get_input_shapes(): @pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) -@pytest.mark.parametrize("input_model", - [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if item.startswith("model") and not item.__contains__("prologued")] -) +@pytest.mark.parametrize("model_type", MODEL_TYPES) @pytest.mark.parametrize("input_shape", get_input_shapes()) def test_eltwise_binary_ops_per_test_plan( input_operator, - input_model, + model_type, input_shape, test_device, dev_data_format=None, @@ -249,23 +405,23 @@ def 
test_eltwise_binary_ops_per_test_plan( # Observed Bugs: -------------------------------------------------------------------------------------------------------------------- # 1. input_shape in ((1, 1000, 100), (10, 1000, 100)): - if input_model == "model_op_src_from_tm_edge1" and input_operator == "Heaviside" and input_shape in (s[30], s[43]): + if model_type == ModelOpSrcFromTmEdge1 and input_operator == "Heaviside" and input_shape in (s[30], s[43]): pytest.xfail(reason="RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/policies/policy_utils.cpp:2221: " + "graph ->get_edges( graph->get_node_by_name(nopInsertInst->src), " + "graph->get_node_by_name(nopInsertInst->dest)) .size() == 1") # 2. input_shape in ((1, 9920, 1), (1, 1, 9920, 1), (9, 1, 9920, 1)): - if input_model == "model_op_src_from_another_op" and input_operator in ["Equal", "NotEqual"] and input_shape in (s[32], s[56], s[69]): + if model_type == ModelFromAnotherOp and input_operator in ["Equal", "NotEqual"] and input_shape in (s[32], s[56], s[69]): pytest.xfail(reason="RuntimeError: Fatal balancer error: Could not reconcile constraints: path[Add0 -> _fused_op_0]") # ------------------------------------------------------------------------------------------------------------------------------------ input_source_flag = None - if input_model == "model_op_src_from_dram_queue": + if model_type == ModelFromDramQueue: input_source_flag = InputSourceFlags.FROM_DRAM verify( test_device=test_device, - input_model=input_model, + model_type=model_type, input_operator=input_operator, input_shape=input_shape, number_of_operands=2, @@ -278,11 +434,11 @@ def test_eltwise_binary_ops_per_test_plan( file_path = VerifyUtils.get_netlist_filename() - if input_model == "model_op_src_from_dram_queue": + if model_type == ModelFromDramQueue: assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' - if input_model == "model_op_src_const_eval_pass": + if model_type == ModelConstEvalPass: # Here we check there is no key with operator name in the netlist in graphs section d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): @@ -349,11 +505,11 @@ def get_input_shapes_prologued(): @pytest.mark.parametrize("input_operator", get_eltwise_binary_ops_prologued()) -@pytest.mark.parametrize("input_model", ["model_op_src_from_dram_queue_prologued"]) +@pytest.mark.parametrize("model_type", [ModelFromDramQueuePrologued]) @pytest.mark.parametrize("input_shape, input_source_flag, should_prolog", get_input_shapes_prologued()) def test_eltwise_binary_ops_per_test_plan_dram_prologued( input_operator, - input_model, + model_type, input_shape, input_source_flag, should_prolog, @@ -364,7 +520,7 @@ def test_eltwise_binary_ops_per_test_plan_dram_prologued( verify( test_device=test_device, - input_model=input_model, + model_type=model_type, input_operator=input_operator, input_shape=input_shape, number_of_operands=1, @@ -412,13 +568,13 @@ def get_single_shape(microbatch_size=1): @pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) -@pytest.mark.parametrize("input_model", ["model_op_src_from_another_op"]) +@pytest.mark.parametrize("model_type", [ModelFromAnotherOp]) @pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_mf_eltwise_binary_ops_per_test_plan(input_operator, input_model, test_device, dev_data_format, math_fidelity): +def 
test_mf_eltwise_binary_ops_per_test_plan(input_operator, model_type, test_device, dev_data_format, math_fidelity): test_eltwise_binary_ops_per_test_plan( input_operator, - input_model, + model_type, get_single_shape(), test_device, dev_data_format, @@ -455,13 +611,13 @@ def test_mf_eltwise_binary_ops_per_test_plan(input_operator, input_model, test_d @pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) -@pytest.mark.parametrize("input_model", ["model_op_src_from_another_op"]) +@pytest.mark.parametrize("model_type", [ModelFromAnotherOp]) @pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_df_eltwise_binary_ops_per_test_plan(input_operator, input_model, test_device, dev_data_format, math_fidelity): +def test_df_eltwise_binary_ops_per_test_plan(input_operator, model_type, test_device, dev_data_format, math_fidelity): test_eltwise_binary_ops_per_test_plan( input_operator, - input_model, + model_type, get_single_shape(), test_device, dev_data_format, From 18c71eae03389419b4ab663e1dd23843928b6e3a Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 17 Jul 2024 09:56:26 +0000 Subject: [PATCH 044/116] Documenting RGG test commands Documenting RGG test commands Split pipeline jobs for PyBuda and PyTorch Issue #2755 (cherry picked from commit a0435005ddb3054db3239b0fafe411111e89dbdf) --- pybuda/test/README.debug.md | 2 +- pybuda/test/random/README.md | 16 ++++++++-- pybuda/test/random/test_graphs.py | 52 +++++++++++++++++-------------- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md index c2d735b6..efe83e80 100644 --- a/pybuda/test/README.debug.md +++ b/pybuda/test/README.debug.md @@ -1,5 +1,5 @@ -*Test specific environment variables that can be used to fine tune default behavior of PyBuda tests.* +*Test specific environment variables that can be used to fine tune default behavior of PyBuda RGG tests.* ## Parameters * RANDOM\_TEST\_COUNT: Number of random tests to be generated and executed. The parameter generate test_index in range from 0 to RANDOM\_TEST\_COUNT-1. 
(default: 5)

diff --git a/pybuda/test/random/README.md b/pybuda/test/random/README.md
index 946f6e54..bc29e9b5 100644
--- a/pybuda/test/random/README.md
+++ b/pybuda/test/random/README.md
@@ -13,10 +13,18 @@ Source code of each randomly generated model with a pytest function can be autom

 ## Run

-Entrypoint for RGG pytests is `test_graphs.py` module
+Entrypoint for RGG pytests is in the `test_graphs.py` module

-```bash
-pytest -svv pybuda/test/random/test_graphs.py`
+Example command for running PyBuda RGG tests generated via the random graph algorithm
+
+```shell
+LOGURU_LEVEL=DEBUG RANDOM_TEST_COUNT=5 MIN_DIM=3 MAX_DIM=4 MIN_OP_SIZE_PER_DIM=16 MAX_OP_SIZE_PER_DIM=64 MIN_MICROBATCH_SIZE=1 MAX_MICROBATCH_SIZE=8 NUM_OF_NODES_MIN=5 NUM_OF_NODES_MAX=10 NUM_OF_FORK_JOINS_MAX=5 CONSTANT_INPUT_RATE=20 SAME_INPUTS_PERCENT_LIMIT=10 pytest -svv pybuda/test/random/test_graphs.py::test_random_graph_algorithm_pybuda
+```
+
+Example command for running PyTorch RGG tests generated via the random graph algorithm
+
+```shell
+LOGURU_LEVEL=DEBUG RANDOM_TEST_COUNT=5 MIN_DIM=4 MAX_DIM=4 MIN_OP_SIZE_PER_DIM=4 MAX_OP_SIZE_PER_DIM=8 MIN_MICROBATCH_SIZE=1 MAX_MICROBATCH_SIZE=1 NUM_OF_NODES_MIN=3 NUM_OF_NODES_MAX=5 NUM_OF_FORK_JOINS_MAX=5 CONSTANT_INPUT_RATE=20 SAME_INPUTS_PERCENT_LIMIT=10 pytest -svv pybuda/test/random/test_graphs.py::test_random_graph_algorithm_pytorch
+```

 ## Configuration

@@ -33,6 +41,8 @@ Parameters includes configuration of:

 For more details about configuration please take a look at `pybuda/test/random/rgg/config.py`.

+Please refer to the full list of supported environment variables in [README.debug.md](../README.debug.md)
+
 ## Development

 Entrypoint for RGG implementation is the `process_test` module
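The environment variables in the run commands above correspond to fields on `randomizer_config` (see `pybuda/test/random/rgg/config.py`). As a minimal sketch of that correspondence (the `env_int` helper and the stand-in namespace are illustrative assumptions, not the repo's actual loader code):

```python
import os
from types import SimpleNamespace

def env_int(name: str, default: int) -> int:
    # Hypothetical helper: read an integer setting from the environment,
    # falling back to the given default when the variable is unset.
    return int(os.environ.get(name, default))

# Assumed mapping from the documented variables onto the randomizer_config
# fields used in test_graphs.py below.
randomizer_config = SimpleNamespace(
    dim_min=env_int("MIN_DIM", 3),
    dim_max=env_int("MAX_DIM", 4),
    op_size_per_dim_min=env_int("MIN_OP_SIZE_PER_DIM", 16),
    op_size_per_dim_max=env_int("MAX_OP_SIZE_PER_DIM", 64),
    microbatch_size_min=env_int("MIN_MICROBATCH_SIZE", 1),
    microbatch_size_max=env_int("MAX_MICROBATCH_SIZE", 8),
    num_of_nodes_min=env_int("NUM_OF_NODES_MIN", 5),
    num_of_nodes_max=env_int("NUM_OF_NODES_MAX", 10),
    num_fork_joins_max=env_int("NUM_OF_FORK_JOINS_MAX", 5),
)
```

diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py
index 251f3301..e802880c 100644
--- a/pybuda/test/random/test_graphs.py
+++ b/pybuda/test/random/test_graphs.py
@@ -132,18 +132,20 @@ def test_random_graph_algorithm_pybuda(test_index, random_seeds, test_device, ra
     randomizer_config = copy(randomizer_config)
     # randomizer_config.debug_shapes = True
     # randomizer_config.verify_shapes = True
-    randomizer_config.dim_min = 3
-    randomizer_config.dim_max = 4
-    randomizer_config.op_size_per_dim_min = 4
-    # randomizer_config.op_size_per_dim_min = 16
-    randomizer_config.op_size_per_dim_max = 8
-    # randomizer_config.op_size_per_dim_max = 64
-    # randomizer_config.op_size_per_dim_max = 256
-    randomizer_config.microbatch_size_min = 1
-    randomizer_config.microbatch_size_max = 8
-    randomizer_config.num_of_nodes_min = 5
-    randomizer_config.num_of_nodes_max = 10
-    randomizer_config.num_fork_joins_max = 5
+
+    # Uncomment the following randomizer_config values to override the default values
+    # randomizer_config.dim_min = 3
+    # randomizer_config.dim_max = 4
+    # randomizer_config.op_size_per_dim_min = 4
+    # # randomizer_config.op_size_per_dim_min = 16
+    # randomizer_config.op_size_per_dim_max = 8
+    # # randomizer_config.op_size_per_dim_max = 64
+    # # randomizer_config.op_size_per_dim_max = 256
+    # randomizer_config.microbatch_size_min = 1
+    # randomizer_config.microbatch_size_max = 8
+    # randomizer_config.num_of_nodes_min = 5
+    # randomizer_config.num_of_nodes_max = 10
+    # randomizer_config.num_fork_joins_max = 5

     # TODO random_seed instead of random_seeds
     random_seed = random_seeds[test_index]
@@ -184,18 +186,20 @@ def test_random_graph_algorithm_pytorch(test_index, random_seeds, test_device, r
     randomizer_config = copy(randomizer_config)
     # randomizer_config.debug_shapes = True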
    # randomizer_config.verify_shapes = True
-    randomizer_config.dim_min = 4
-    randomizer_config.dim_max = 4
-    randomizer_config.op_size_per_dim_min = 4
-    # randomizer_config.op_size_per_dim_min = 16
-    randomizer_config.op_size_per_dim_max = 8
-    # randomizer_config.op_size_per_dim_max = 64
-    # randomizer_config.op_size_per_dim_max = 256
-    randomizer_config.microbatch_size_min = 1
-    randomizer_config.microbatch_size_max = 8
-    randomizer_config.num_of_nodes_min = 3
-    randomizer_config.num_of_nodes_max = 5
-    randomizer_config.num_fork_joins_max = 5
+
+    # Uncomment the following randomizer_config values to override the default values
+    # randomizer_config.dim_min = 4
+    # randomizer_config.dim_max = 4
+    # randomizer_config.op_size_per_dim_min = 4
+    # # randomizer_config.op_size_per_dim_min = 16
+    # randomizer_config.op_size_per_dim_max = 8
+    # # randomizer_config.op_size_per_dim_max = 64
+    # # randomizer_config.op_size_per_dim_max = 256
+    # randomizer_config.microbatch_size_min = 1
+    # randomizer_config.microbatch_size_max = 8
+    # randomizer_config.num_of_nodes_min = 3
+    # randomizer_config.num_of_nodes_max = 5
+    # randomizer_config.num_fork_joins_max = 5

     # TODO random_seed instead of random_seeds
     random_seed = random_seeds[test_index]

From e71f36986cbb82d4f28f34833cc56b39b042a581 Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Wed, 17 Jul 2024 10:04:09 +0000
Subject: [PATCH 045/116] Debugging graph building errors

Store failing test code in case an exception occurs during graph building, like a validation error.

Split GraphNodeSetup.init_nodes into smaller functions to enable reuse.

Skip importing DebugUtils if debug_shapes is turned off

Issue #2755

(cherry picked from commit 40d7a41b20e68eaa8137f12a88f914fbfb46537e)
---
 pybuda/test/random/rgg/algorithms.py          | 66 ++++++++++++-------
 pybuda/test/random/rgg/base.py                | 15 ++++-
 .../random/rgg/pybuda/generated_model.jinja2  |  4 +-
 .../random/rgg/pytorch/generated_model.jinja2 |  4 +-
 4 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py
index 46d770f6..f72ad8ac 100644
--- a/pybuda/test/random/rgg/algorithms.py
+++ b/pybuda/test/random/rgg/algorithms.py
@@ -26,34 +26,16 @@ class GraphNodeSetup:
     always_unique_variables = False

     @classmethod
-    def init_nodes(cls, test_context: RandomizerTestContext):
+    def init_nodes_names(cls, test_context: RandomizerTestContext):
         """
-        Initializes the nodes of a graph.
+        Initializes the node names of a graph.

-        This method does three main things:
+        This method does the following:
         1. Sets the index for each node.
         2. Stores output values if they are needed as explicit input for a later operator.
-        3. Setting input nodes for open nodes.
-        4. Generates random settings for operator parameters.
-
-        Args:
-            test_context (RandomizerTestContext): The test context.
-
-        Raises:
-            Exception: If the number of inputs for a node does not match the configured input number.
-            Exception: If the node operator is not of type RandomizerOperator.
-
-        Returns:
-            None
         """
-        graph = test_context.graph
-        nodes = test_context.graph.nodes
-        rng_shape = test_context.rng_shape
-        rng_params = test_context.rng_params
-
-        constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate)
-        same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit)
+        nodes = test_context.graph.nodes

         # Setting node.index
         op_index_cnt = 0
@@ -72,6 +54,25 @@ def init_nodes(cls, test_context: RandomizerTestContext):
             input_node.out_value = input_node.operator_name
             logger.trace(f"Set out_value = {input_node.out_value}")

+    @classmethod
+    def init_nodes_inputs(cls, test_context: RandomizerTestContext):
+        """
+        Sets input and constant nodes for open nodes.
+
+        Args:
+            test_context (RandomizerTestContext): The test context.
+
+        Returns:
+            None
+        """
+        graph = test_context.graph
+        nodes = test_context.graph.nodes
+
+        rng_shape = test_context.rng_shape
+
+        constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate)
+        same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit)
+
         logger.trace("Setting input nodes for open nodes")
         open_nodes = NodeUtils.get_open_nodes(nodes)
         logger.trace(f"Open nodes {StrUtils.nodes_to_str(open_nodes)}")
@@ -133,6 +134,20 @@ def init_nodes(cls, test_context: RandomizerTestContext):
                 iconst_index += 1
                 constant_node.out_value = f"iconst{iconst_index}"

+    @classmethod
+    def init_nodes_params(cls, test_context: RandomizerTestContext):
+        """
+        Generates random parameters for each node.
+
+        Args:
+            test_context (RandomizerTestContext): The test context.
+
+        Returns:
+            None
+        """
+        nodes = test_context.graph.nodes
+        rng_params = test_context.rng_params
+
         logger.trace("Generating random settings for operator parameters")
         # Generate random values for operator parameters
         for node in nodes:
@@ -172,7 +187,9 @@ def prepare_graph(cls, test_context: RandomizerTestContext):
         graph = test_context.graph

         logger.trace("Initializing nodes")
-        cls.init_nodes(test_context)
+        cls.init_nodes_names(test_context)
+        cls.init_nodes_inputs(test_context)
+        cls.init_nodes_params(test_context)
         logger.trace("Nodes initialized")

         logger.trace("Validating graph")
@@ -196,7 +213,8 @@ def __init__(self, framework: Framework, randomizer_config):
     def _get_random_operator(self, rng):
         return rng.choice(self.operators)

-    def _init_default_constructor_params(self, node: RandomizerNode):
+    @classmethod
+    def _init_default_constructor_params(cls, node: RandomizerNode):
        '''Initializing default constructor parameters based on input and output shapes'''
         # Operator specific settings
         # TODO abstract this
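The `RateLimitter(rng_shape, 100, rate)` instances above gate how often constant inputs and repeated inputs are injected into the graph. Judging purely from these call sites, the behavior amounts to a percentage-based random draw; a minimal sketch of that assumed semantics (not the class's actual implementation):

```python
import random

class PercentRateLimiter:
    """Sketch of assumed semantics: allow an event roughly `limit` times
    out of `total` draws, e.g. PercentRateLimiter(rng, 100, 20) ~ 20%."""

    def __init__(self, rng: random.Random, total: int, limit: int):
        self.rng = rng
        self.total = total
        self.limit = limit

    def is_allowed(self) -> bool:
        # Draw uniformly from [0, total) and allow if the draw is below the limit.
        return self.rng.randrange(self.total) < self.limit

rng = random.Random(42)
limiter = PercentRateLimiter(rng, 100, 20)
hits = sum(limiter.is_allowed() for _ in range(10_000))
print(hits / 10_000)  # approximately 0.2
```

diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py
index c1f8a841..5cec806a 100644
--- a/pybuda/test/random/rgg/base.py
+++ b/pybuda/test/random/rgg/base.py
@@ -243,7 +243,19 @@ def run(self, graph_builder: GraphBuilder):
         # build random graph for the specified parameters
         logger.trace("Building graph started")
         graph_duration = Timer()
-        self.build_graph(graph_builder)
+        try:
+            self.build_graph(graph_builder)
+        except Exception as e1:
+            # Try to save test source code to file for debugging purposes if an error occurs
+            try:
+                test_code_str = self.generate_code()
+                if randomizer_config.save_tests:
+                    # Saving test source code to file for debugging purposes
+                    self.save_test(test_code_str, failing_test=True)
+            except Exception as e2:
+                logger.error(f"Error while saving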
test: {e2}") + # Re-raise the original exception from graph building + raise e1 logger.trace("Building graph completed") graph = self.test_context.graph logger.debug(f"Generating graph model {GraphUtils.short_description(graph)}") @@ -288,6 +300,7 @@ def process_test(test_name: str, test_index: int, random_seed: int, test_device: Process a single randomizer test. Args: + test_name (str): The name of the test used for generating test code, test file name, etc. test_index (int): The index of the test. random_seed (int): The random seed for the test. test_device (TestDevice): The device for the test. diff --git a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 index b362765f..6b91c6ed 100644 --- a/pybuda/test/random/rgg/pybuda/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pybuda/generated_model.jinja2 @@ -3,8 +3,8 @@ import pybuda {% if test_format %} import pytest from pybuda.verify import verify_module, VerifyConfig -{% endif %} -from test.random.rgg import DebugUtils +{% endif %}{% if randomizer_config.debug_shapes %} +from test.random.rgg import DebugUtils{% endif %} from pybuda import PyBudaModule, Tensor {# TODO replace empty new lines with spaces to keep formatting in pipeline #} diff --git a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 index dcbfabdb..81317695 100644 --- a/pybuda/test/random/rgg/pytorch/generated_model.jinja2 +++ b/pybuda/test/random/rgg/pytorch/generated_model.jinja2 @@ -3,8 +3,8 @@ import pybuda {% if test_format %} import pytest from pybuda.verify import verify_module, VerifyConfig -{% endif %} -from test.random.rgg import DebugUtils +{% endif %}{% if randomizer_config.debug_shapes %} +from test.random.rgg import DebugUtils{% endif %} {# TODO replace empty new lines with spaces to keep formatting in pipeline #} class GeneratedTestModel_{{ test_index }}_{{ random_seed }}(torch.nn.Module): From 93793b2f10dfa7791bd82bee25e8cfa132699e69 Mon Sep 17 00:00:00 2001 From: jserbedzija Date: Tue, 23 Jul 2024 09:49:13 +0000 Subject: [PATCH 046/116] [Blackhole] Don't use top 16MB of dram channels when allocating queues (cherry picked from commit 9b1705013a058bb57fd7573a1246c7beab7e5b68) --- pybuda/csrc/lower_to_buda/queue.hpp | 4 +-- pybuda/csrc/placer/best_fit_allocator.cpp | 20 +++++++------- pybuda/csrc/placer/best_fit_allocator.hpp | 11 ++++---- pybuda/csrc/placer/dram.hpp | 3 +-- pybuda/csrc/placer/dram_allocator.cpp | 33 ++++++++++++++--------- pybuda/csrc/placer/dram_allocator.hpp | 20 +++++++------- pybuda/csrc/placer/placer.hpp | 8 +++--- 7 files changed, 53 insertions(+), 46 deletions(-) diff --git a/pybuda/csrc/lower_to_buda/queue.hpp b/pybuda/csrc/lower_to_buda/queue.hpp index 9ec65cad..0fc6be7e 100644 --- a/pybuda/csrc/lower_to_buda/queue.hpp +++ b/pybuda/csrc/lower_to_buda/queue.hpp @@ -21,12 +21,12 @@ enum BudaQueueLocation { DRAM, HOST }; struct BudaQueueDramLoc { std::uint32_t dram_channel; - std::uint32_t dram_address; + std::size_t dram_address; }; struct BudaQueueHostLoc { std::uint32_t host_channel; - std::uint32_t host_address; + std::size_t host_address; }; struct BudaQueue { diff --git a/pybuda/csrc/placer/best_fit_allocator.cpp b/pybuda/csrc/placer/best_fit_allocator.cpp index f2644fa1..01e1b4c0 100644 --- a/pybuda/csrc/placer/best_fit_allocator.cpp +++ b/pybuda/csrc/placer/best_fit_allocator.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 #include "placer/best_fit_allocator.hpp" +#include namespace 
tt::placer { -BestFitAllocator::BestFitAllocator(std::uint32_t start_addr, std::uint32_t end_addr, Blocks pre_allocated_blocks) : ChannelAllocator() +BestFitAllocator::BestFitAllocator(std::size_t start_addr, std::size_t end_addr, Blocks pre_allocated_blocks) : ChannelAllocator() { if (pre_allocated_blocks.free_blocks_start.size() > 0) { blocks = pre_allocated_blocks; @@ -21,9 +22,9 @@ void BestFitAllocator::add_free_block(const Block &block) blocks.free_blocks_end[block.addr + block.size] = block; } -std::uint32_t BestFitAllocator::get_capacity() +std::size_t BestFitAllocator::get_capacity() { - std::uint32_t capacity = 0; + std::size_t capacity = 0; for (auto free_block : blocks.free_blocks_start) { capacity += free_block.second.size; } @@ -32,16 +33,16 @@ std::uint32_t BestFitAllocator::get_capacity() void BestFitAllocator::remove_free_block(const Block &block) { - std::uint32_t end = block.addr + block.size; + std::size_t end = block.addr + block.size; blocks.free_blocks_start.erase(block.addr); blocks.free_blocks_end.erase(end); } -bool BestFitAllocator::allocate(std::uint32_t size, std::uint32_t &addr) +bool BestFitAllocator::allocate(std::size_t size, std::size_t &addr) { // Find the free block with the closest >= size Block closest_block; - std::uint32_t diff = UINT32_MAX; + std::size_t diff = SIZE_MAX; for (auto it = blocks.free_blocks_start.rbegin(); it != blocks.free_blocks_start.rend(); it++) { if (it->second.size >= size) @@ -56,10 +57,9 @@ bool BestFitAllocator::allocate(std::uint32_t size, std::uint32_t &addr) } } - if (diff == UINT32_MAX) + if (diff == SIZE_MAX) return false; - addr = closest_block.addr; // Since we allocate new block from right to left, end of the free block will be the end of our new allocated block addr = closest_block.addr + closest_block.size - size; remove_free_block(closest_block); @@ -73,7 +73,7 @@ bool BestFitAllocator::allocate(std::uint32_t size, std::uint32_t &addr) return true; } -void BestFitAllocator::deallocate(std::uint32_t addr) +void BestFitAllocator::deallocate(std::size_t addr) { //return; auto it = blocks.allocated_blocks.find(addr); @@ -107,7 +107,7 @@ void BestFitAllocator::clear_allocated_blocks() Blocks blocks = get_blocks(); for (const auto& address_block_pair : blocks.allocated_blocks) { - std::uint32_t start_address = address_block_pair.first; + std::size_t start_address = address_block_pair.first; deallocate(start_address); } } diff --git a/pybuda/csrc/placer/best_fit_allocator.hpp b/pybuda/csrc/placer/best_fit_allocator.hpp index 6c6ab4cd..4e9a1df2 100644 --- a/pybuda/csrc/placer/best_fit_allocator.hpp +++ b/pybuda/csrc/placer/best_fit_allocator.hpp @@ -4,7 +4,6 @@ #pragma once #include "placer/dram_allocator.hpp" -#include "placer/dram.hpp" namespace tt::placer { @@ -15,10 +14,10 @@ class BestFitAllocator : public ChannelAllocator void remove_free_block(const Block &block); public: virtual Blocks get_blocks() override { return blocks; } - BestFitAllocator(std::uint32_t start_addr, std::uint32_t end_addr, Blocks pre_allocated_blocks = Blocks()); - virtual bool allocate(std::uint32_t size, std::uint32_t &addr) override; // return true if allocated, and update addr - virtual void deallocate(std::uint32_t addr) override; - virtual std::uint32_t get_capacity() override; - virtual void clear_allocated_blocks(); + BestFitAllocator(std::size_t start_addr, std::size_t end_addr, Blocks pre_allocated_blocks = Blocks()); + virtual bool allocate(std::size_t size, std::size_t &addr) override; // return true if allocated, and update addr + 
virtual void deallocate(std::size_t addr) override; + virtual std::size_t get_capacity() override; + virtual void clear_allocated_blocks() override; }; } diff --git a/pybuda/csrc/placer/dram.hpp b/pybuda/csrc/placer/dram.hpp index e0b627ec..d3bb3d98 100644 --- a/pybuda/csrc/placer/dram.hpp +++ b/pybuda/csrc/placer/dram.hpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 #pragma once -#include #include "backend_api/device_config.hpp" #include "balancer/balancer.hpp" @@ -157,7 +156,7 @@ struct QueueDRAMPlacementParameters bool in_p2p_region_hard; bool is_input; bool is_prologue; - std::uint32_t queue_size; + std::size_t queue_size; }; using DRAMScheduleData = std::pair; diff --git a/pybuda/csrc/placer/dram_allocator.cpp b/pybuda/csrc/placer/dram_allocator.cpp index 3ec2fbc8..a7fcceca 100644 --- a/pybuda/csrc/placer/dram_allocator.cpp +++ b/pybuda/csrc/placer/dram_allocator.cpp @@ -184,7 +184,7 @@ DramAllocator::DramAllocator( std::unordered_set allocated_channels; switch (allocator_algorithm) { - case BEST_FIT: + case BEST_FIT: { std::size_t p2p_offset; std::size_t p2p_size; @@ -208,11 +208,20 @@ DramAllocator::DramAllocator( // address is p2p_offset - 1. channel_allocators.push_back(std::make_unique( dram_config.dram_config[0].initial_dram_offset, p2p_offset - 1, allocated_blocks[0])); + + std::size_t limit_top_address = 0; + if (dram_config.device_config.is_blackhole()) + { + // The top 16MB (0xFF00_0000 - 0xFFFF_FFFF) of DRAM are not accessible through the NOC on blackhole. + // + limit_top_address = 16 * 1024 * 1024; + } + if (dram_config.device_config.is_wormhole_b0()) { channel_allocators.push_back(std::make_unique( std::max(p2p_offset + p2p_size, dram_config.dram_config[0].initial_dram_offset), - dram_config.dram_config[0].channel_size - 1, + dram_config.dram_config[0].channel_size - 1 - limit_top_address, allocated_blocks[0])); } allocated_channels.insert(0); // 0 is done @@ -237,7 +246,7 @@ DramAllocator::DramAllocator( allocated_blocks[2 * dram_config.dram_config[i].channel])); channel_allocators.push_back(std::make_unique( std::max(dram_config.dram_config[i].initial_dram_offset, dram_config.dram_config[i].channel_size / 2), - dram_config.dram_config[i].channel_size - 1, + dram_config.dram_config[i].channel_size - 1 - limit_top_address, allocated_blocks[(2 * dram_config.dram_config[i].channel) + 1])); } else @@ -251,7 +260,7 @@ DramAllocator::DramAllocator( p2p_allocator = std::make_unique(p2p_offset, p2p_offset + p2p_size - 1, allocated_blocks.back()); break; - + } default: TT_THROW("Unknown placement algorithm"); } } @@ -265,10 +274,10 @@ std::vector DramAllocator::get_blocks() } // Gets dram free space, both in p2p region (managed by p2p_allocator) and in regular part of dram (managed by channel_allocators) -std::pair DramAllocator::get_dram_free_space() +std::pair DramAllocator::get_dram_free_space() { - uint32_t regular_free_space = 0; - uint32_t p2p_free_space = 0; + size_t regular_free_space = 0; + size_t p2p_free_space = 0; for (std::size_t i = 0; i < channel_allocators.size(); i++) { regular_free_space += channel_allocators.at(i)->get_capacity(); @@ -454,8 +463,8 @@ bool DramAllocator::allocate_queues( QueueBufferPlacement DramAllocator::create_buffer_placement( std::uint32_t virtual_channel, - std::uint32_t channel_address, - std::uint32_t buffer_size, + std::size_t channel_address, + std::size_t buffer_size, bool in_p2p_region) { std::uint32_t real_channel = virtual_channel; @@ -478,8 +487,8 @@ QueueBufferPlacement DramAllocator::create_buffer_placement( 
std::pair> DramAllocator::allocate_buffers(const QueueDRAMPlacementParameters ¶meters) { - const std::uint32_t num_channels = channel_allocators.size(); - const std::uint32_t buffer_size = parameters.queue_size; + const std::size_t num_channels = channel_allocators.size(); + const std::size_t buffer_size = parameters.queue_size; TT_ASSERT(buffer_size > 0, "Buffer size for queue {} must be larger than 0", parameters.node->name()); std::vector buffer_placement; @@ -487,7 +496,7 @@ std::pair> DramAllocator::allocate_buffe { for (std::uint32_t col = 0; col < parameters.grid_shape.columns; col++) { - std::uint32_t allocated_address; + std::size_t allocated_address; if (parameters.in_p2p_region_soft or parameters.in_p2p_region_hard) { diff --git a/pybuda/csrc/placer/dram_allocator.hpp b/pybuda/csrc/placer/dram_allocator.hpp index bda21465..0edff2fc 100644 --- a/pybuda/csrc/placer/dram_allocator.hpp +++ b/pybuda/csrc/placer/dram_allocator.hpp @@ -15,13 +15,13 @@ namespace placer { struct Block { - std::uint32_t addr, size; + std::size_t addr, size; }; struct Blocks { - std::map free_blocks_start; // keyed on start addr - std::unordered_map free_blocks_end; // keyed on start+size - std::unordered_map allocated_blocks; // keyed on start + std::map free_blocks_start; // keyed on start addr + std::unordered_map free_blocks_end; // keyed on start+size + std::unordered_map allocated_blocks; // keyed on start }; // Allocate buffers within one channel class ChannelAllocator @@ -29,9 +29,9 @@ class ChannelAllocator public: ChannelAllocator() {} virtual ~ChannelAllocator() = default; - virtual bool allocate(std::uint32_t size, std::uint32_t &addr) = 0; // return true if allocated, and update addr - virtual void deallocate(std::uint32_t addr) = 0; - virtual std::uint32_t get_capacity() = 0; + virtual bool allocate(std::size_t size, std::size_t &addr) = 0; // return true if allocated, and update addr + virtual void deallocate(std::size_t addr) = 0; + virtual std::size_t get_capacity() = 0; virtual Blocks get_blocks() = 0; virtual void clear_allocated_blocks() = 0; }; @@ -75,8 +75,8 @@ class DramAllocator QueueBufferPlacement create_buffer_placement( std::uint32_t virtual_channel, - std::uint32_t address, - std::uint32_t block_size, + std::size_t address, + std::size_t buffer_size, bool allocated_in_p2p_region); std::pair> allocate_buffers(const QueueDRAMPlacementParameters ¶meters); void deallocate_buffers(const QueuePlacement& queue_placement, const QueueDRAMPlacementParameters& placement_parameters); @@ -92,7 +92,7 @@ class DramAllocator bool allocate_queues(std::vector &scheduled_queue_placements, bool disable_dynamic_dram, int microbatch_size); void reset_dram_allocator(); std::vector get_blocks(); - std::pair get_dram_free_space(); + std::pair get_dram_free_space(); std::uint32_t get_num_of_channels() { return channel_allocators.size(); }; }; diff --git a/pybuda/csrc/placer/placer.hpp b/pybuda/csrc/placer/placer.hpp index 566a0481..06b56df0 100644 --- a/pybuda/csrc/placer/placer.hpp +++ b/pybuda/csrc/placer/placer.hpp @@ -289,11 +289,11 @@ struct OpPlacement struct QueueBufferPlacement { uint32_t dram_channel; - uint32_t dram_address; + size_t dram_address; // Not strictly needed to set placement, but convenient to have here Coord dram_channel_location; - uint32_t buffer_size; + size_t buffer_size; bool allocated_in_p2p_region; // methods @@ -303,8 +303,8 @@ struct QueueBufferPlacement struct QueueHostBufferPlacement { uint32_t channel; - uint32_t address; - uint32_t buffer_size; + size_t address; + 
size_t buffer_size;

     // methods
     json to_json() const;

From 0f43c183d296bb76dbcb8c8a60325fc96404510a Mon Sep 17 00:00:00 2001
From: chandrasekaranpradeep
Date: Wed, 24 Jul 2024 09:39:20 -0400
Subject: [PATCH 047/116] [TVM] Decompose repeat_interleave pytorch op and add sanity test

(cherry picked from commit a4ff19a91355338f49d068d9e64302c52c258825)
---
 .../sanity/tests_B/test_pattern_matcher.py | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py
index 0a987176..39a566c0 100644
--- a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py
+++ b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py
@@ -193,3 +193,57 @@ def forward(self,input):
             verify_tvm_compile=True,
         )
     )
+
+@pytest.mark.parametrize("input_shape", ((1, 4), (1, 4, 3), (1, 2, 7, 6)))
+@pytest.mark.parametrize("repeat_dims", (1, 2, 3, -1, -2, -3))
+@pytest.mark.parametrize("num_repeats", (2, 3))
+def test_repeat_interleave_pytorch(test_device, input_shape, repeat_dims, num_repeats):
+
+    dims = repeat_dims
+    if dims < 0:
+        dims = len(input_shape) + dims
+        if dims < 0:
+            pytest.skip()
+
+    if dims > int(len(input_shape) - 1) or input_shape[dims] == 1:
+        pytest.skip()
+
+    # Set PyBuda configuration parameters
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.balancer_policy = "Ribbon"
+    compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b
+
+    class Repeat_interleave_model(torch.nn.Module):
+        def __init__(self, repeats, dims):
+            super().__init__()
+            self.repeats = repeats
+            self.dims = dims
+
+        def forward(self, input_tensor):
+            return torch.repeat_interleave(input_tensor, repeats = self.repeats, dim = self.dims)
+
+    model = Repeat_interleave_model(repeats=num_repeats, dims=repeat_dims)
+    model.eval()
+
+    # Create PyBuda module from PyTorch model
+    tt_model = pybuda.PyTorchModule(
+        "pt_repeat_interleave", model
+    )
+
+    input_sample = torch.rand(input_shape)
+
+    # Run inference on Tenstorrent device
+    verify_module(
+        tt_model,
+        input_shapes=[(input_sample.shape,)],
+        inputs=[(input_sample,)],
+        verify_cfg=VerifyConfig(
+            arch=test_device.arch,
+            devtype=test_device.devtype,
+            devmode=test_device.devmode,
+            test_kind=TestKind.INFERENCE,
+            verify_pybuda_codegen_vs_framework=True,
+            verify_tvm_compile=True,
+        ),
+    )
+
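Since the commit is about decomposing `repeat_interleave`, it may help to see the equivalence the test exercises: repeating each element along `dim` can be expressed with unsqueeze/expand/reshape alone. A minimal sketch (an illustrative decomposition, not necessarily the exact sequence TVM emits):

```python
import torch

def repeat_interleave_decomposed(t: torch.Tensor, repeats: int, dim: int) -> torch.Tensor:
    dim = dim % t.dim()          # normalize negative dims
    t = t.unsqueeze(dim + 1)     # insert a unit axis right after `dim`
    shape = list(t.shape)
    shape[dim + 1] = repeats
    t = t.expand(shape)          # broadcast the unit axis to `repeats`
    out_shape = list(t.shape)
    del out_shape[dim + 1]
    out_shape[dim] = out_shape[dim] * repeats
    return t.reshape(out_shape)  # fold the repeats back into `dim`

x = torch.tensor([[1, 2], [3, 4]])
assert torch.equal(repeat_interleave_decomposed(x, 2, 1),
                   torch.repeat_interleave(x, repeats=2, dim=1))
# Both produce tensor([[1, 1, 2, 2], [3, 3, 4, 4]])
```

From a4843f7447595c745e3d5842b6e411879f3ca24e Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic
Date: Wed, 24 Jul 2024 14:24:42 +0000
Subject: [PATCH 048/116] Update BBE submodule to week29

---
 third_party/budabackend | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/budabackend b/third_party/budabackend
index 44ae26b5..e3af07f5 160000
--- a/third_party/budabackend
+++ b/third_party/budabackend
@@ -1 +1 @@
-Subproject commit 44ae26b5edad8acf7dd5b4335f716ddc617ee8c4
+Subproject commit e3af07f5059d026a1c2d839254197fda7bad15ef

From c79c1b4f7d55635559c6ac3bd98717fa2f7df378 Mon Sep 17 00:00:00 2001
From: Vladimir Milosevic
Date: Wed, 24 Jul 2024 14:27:52 +0000
Subject: [PATCH 049/116] Update tvm and demos submodules

---
 third_party/buda-model-demos | 2 +-
 third_party/tvm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/buda-model-demos b/third_party/buda-model-demos
index a6739e0e..e3c6976f 160000
--- a/third_party/buda-model-demos
+++ b/third_party/buda-model-demos
@@ -1 +1 @@
-Subproject commit a6739e0ef00565c4b5c4ee2d8251c9f53428b888
+Subproject commit e3c6976f4392b5e06f4e4c041a851d800fdcb353
diff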
--git a/third_party/tvm b/third_party/tvm index 6b61b3e8..c748d645 160000 --- a/third_party/tvm +++ b/third_party/tvm @@ -1 +1 @@ -Subproject commit 6b61b3e805f94ab16393fe35edaa4e650089933c +Subproject commit c748d64552cd86cc3406511366d96ae8ae866ecd From 22404e849bac873e96f49e8e0f3339c77e82072b Mon Sep 17 00:00:00 2001 From: kkannan Date: Tue, 23 Jul 2024 19:07:45 +0000 Subject: [PATCH 050/116] Add tests for ddrnet23-semantic segmentation(ONNX)-WHB0 (cherry picked from commit 0ca341a62e3e27e539c6efcb4754378b283af7cb) --- .../high_prio/cnn/onnx/test_ddrnet.py | 99 +++++++------------ 1 file changed, 33 insertions(+), 66 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py index 8d2fd159..fa30bdec 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py @@ -37,9 +37,7 @@ def test_ddrnet(variant, test_device): # STEP 2: # Create PyBuda module from onnx weights model_name = f"{variant}_onnx" - load_path = ( - f"third_party/confidential_customer_models/internal/ddrnet/files/onnx/{variant}.onnx" - ) + load_path = f"third_party/confidential_customer_models/internal/ddrnet/files/onnx/{variant}.onnx" model = onnx.load(load_path) tt_model = pybuda.OnnxModule(model_name, model, load_path) @@ -68,17 +66,12 @@ def test_ddrnet(variant, test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - pcc=( - 0.98 - if test_device.arch == BackendDevice.Grayskull - and variant != "ddrnet23s" - else 0.99 - ), + pcc=(0.98 if test_device.arch == BackendDevice.Grayskull and variant != "ddrnet23s" else 0.99), ), ) -variants = ["ddrnet_23_slim_1024"] +variants = ["ddrnet_23_slim_1024", "ddrnet23_cityscapes"] @pytest.mark.parametrize("variant", variants) @@ -90,63 +83,36 @@ def test_ddrnet_semantic_segmentation_onnx(variant, test_device): compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b if test_device.arch == BackendDevice.Wormhole_B0: - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "36864" - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone925.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11803.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11809.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11986.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 16), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11980.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11872.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11866.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 8), - ) - - if test_device.arch == BackendDevice.Grayskull: - compiler_cfg.balancer_op_override( - 
"conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 32), - ) - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "24576" - compiler_cfg.balancer_op_override( - "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11915.dc.sparse_matmul.4.lc2", - "t_stream_shape", - (1, 32), - ) + if variant == "ddrnet_23_slim_1024": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "36864" + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone925.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11803.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11809.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11986.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 16)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11980.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11872.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11866.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + + elif variant == "ddrnet23_cityscapes": + compiler_cfg.balancer_op_override("conv2d_213.dc.conv2d.5.dc.reshape.0_operand_commute_clone1044.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 16)) + compiler_cfg.balancer_op_override("conv2d_213.dc.conv2d.5.dc.reshape.0_operand_commute_clone1050.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 16)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "153600" + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + elif test_device.arch == BackendDevice.Grayskull: + if variant == "ddrnet_23_slim_1024": + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 32)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "24576" + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11915.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 32)) # Load and validate the model - load_path = f"third_party/confidential_customer_models/customer/model_0/files/cnn/ddrnet/{variant}.onnx" + if variant == "ddrnet_23_slim_1024": + load_path = f"third_party/confidential_customer_models/customer/model_0/files/cnn/ddrnet/{variant}.onnx" + else: + load_path = f"third_party/confidential_customer_models/internal/ddrnet/files/onnx/{variant}.onnx" model = onnx.load(load_path) onnx.checker.check_model(model) model_name = f"onnx_{variant}" @@ -155,7 +121,8 @@ def test_ddrnet_semantic_segmentation_onnx(variant, test_device): # Prepare input image_path = "third_party/confidential_customer_models/internal/ddrnet/files/samples/road_scenes.png" input_image = Image.open(image_path) - input_image = transforms.Resize((1024, 1024))(input_image) + if variant == "ddrnet_23_slim_1024": + input_image = 
transforms.Resize((1024, 1024))(input_image) input_tensor = transforms.ToTensor()(input_image) input_batch = input_tensor.unsqueeze(0) From 91da2f4ff3cc790344cc96ad5487666253a11b03 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Thu, 25 Jul 2024 14:55:10 +0000 Subject: [PATCH 051/116] Bring up QDQ MobilenetV2 and Regnet_y (cherry picked from commit 0862b99358a834f5aef5e9929f937b231d4f08e3) --- README.debug.md | 1 + pybuda/csrc/buda_passes.cpp | 14 +- pybuda/csrc/graph_lib/python_bindings.cpp | 1 + pybuda/csrc/graph_lib/utils.cpp | 74 +++++ pybuda/csrc/graph_lib/utils.hpp | 2 + pybuda/csrc/passes/commute_utils.cpp | 261 ++++++++++-------- pybuda/csrc/passes/consteval.cpp | 20 +- .../csrc/passes/erase_consecutive_reshape.cpp | 4 +- pybuda/csrc/passes/erase_inverse_ops.cpp | 2 +- .../csrc/passes/fork_quantization_scales.cpp | 60 ++++ .../csrc/passes/fork_quantization_scales.hpp | 17 ++ .../passes/fuse_redundant_tm_sequence.cpp | 2 +- pybuda/csrc/passes/insert_inverse_on_io.cpp | 10 +- ...nsert_inverse_outside_quantized_region.cpp | 148 +++++++++- pybuda/csrc/passes/make_quantized_ops.cpp | 22 +- pybuda/csrc/passes/move_requantize.cpp | 2 +- pybuda/csrc/passes/pre_lowering_passes.cpp | 2 +- pybuda/csrc/passes/remove_quant_dequant.cpp | 90 +++++- pybuda/pybuda/op/eval/buda/tm.py | 10 +- pybuda/pybuda/op/eval/pybuda/convolution.py | 3 + pybuda/pybuda/op/eval/pybuda/quantize.py | 2 +- pybuda/pybuda/op/eval/pybuda/tm.py | 10 +- pybuda/pybuda/op/eval/pybuda/transpose.py | 11 +- ...py => test_onnx_quantized_mobilenet_v2.py} | 143 +++++----- .../test_onnx_quantized_mobilenet_v3.py | 116 ++++++++ .../quantized/test_onnx_quantized_regnet_y.py | 69 +++++ .../quantized/test_onnx_quantized_resnet.py | 86 +----- 27 files changed, 850 insertions(+), 332 deletions(-) create mode 100644 pybuda/csrc/passes/fork_quantization_scales.cpp create mode 100644 pybuda/csrc/passes/fork_quantization_scales.hpp rename pybuda/test/quantized/{test_onnx_quantized_mobilenet.py => test_onnx_quantized_mobilenet_v2.py} (50%) create mode 100644 pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py create mode 100644 pybuda/test/quantized/test_onnx_quantized_regnet_y.py diff --git a/README.debug.md b/README.debug.md index 80637dc0..c0171c12 100644 --- a/README.debug.md +++ b/README.debug.md @@ -119,6 +119,7 @@ * PYBUDA\_VERSIM\_DEVICE\_ARCH: This env variable represents the architecture of the Versim device used in the pytest. * PYBUDA\_ENABLE\_EMULATION\_DEVICE: This device is a specific silicon emulation device that PyBUDA supports. The variable is used to enable emulation device in PyBUDA pytest environment. By setting this variable to 1, we are instructing PyBUDA to use the emulation device as the target device instead of the silicon or golden device. Enabling the emulation device can be useful for testing or experimentation purposes, allowing us to evaluate the behaviour of our code on this emulation device. In order to run emulation device as a targeted device, the source code must be built with EMULATION_DEVICE_EN=1 environment variable. * PYBUDA\_EMULATION\_DEVICE\_ARCH: This env variable represents the architecture of the emulation device used in the pytest. + * PYBUDA\_DISABLE\_DEPTHWISE\_CONV2D\_DECOMP: If set to 1, depthwise conv2d ops will not be decomposed using the depthwise op and instead use a matmul. 
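+   For example, a hypothetical invocation that forces the matmul path for a grouped-conv test (any model with a depthwise conv2d would do):
+
+       PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP=1 pytest pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py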
## Golden overrides * GOLDEN\_WORMHOLE\_B0: run Golden with Wormhole_B0 as target device instead of Grayskull (default) diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 39cac9f2..64e971fe 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -53,6 +53,7 @@ #include "passes/set_tile_dim.hpp" #include "passes/squeeze_to_reshape.hpp" #include "passes/t_stream.hpp" +#include "passes/fork_quantization_scales.hpp" #include "perf_model/perf_model.hpp" #include "placer/dram.hpp" #include "placer/dram_allocator.hpp" @@ -108,6 +109,7 @@ run_post_initial_graph_passes(graphlib::Graph *graph, py::object compiler_cfg_ob attempt_update |= passes::dequant_quant_to_requant(graph); } + passes::fork_quantization_scales(graph); passes::remove_quant_dequant(graph); reportify::dump_graph(graph->name(), "post_quantize_commute", graph); passes::decompose_nd_reshape_split(graph); @@ -181,14 +183,12 @@ void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &d } // Move TMs outside of quantized graph regions - // attempt_update = true; - // while(attempt_update) { - // passes::insert_inverse_outside_quantized_region(graph); - // attempt_update = passes::erase_inverse_ops(graph); - // } - + attempt_update = true; + while(attempt_update) { + passes::insert_inverse_outside_quantized_region(graph); + attempt_update = passes::erase_inverse_ops(graph); + } - passes::move_tm_through_requantize(graph); recalculate_shapes(graph); passes::hoist_transforms_to_inputs(graph); diff --git a/pybuda/csrc/graph_lib/python_bindings.cpp b/pybuda/csrc/graph_lib/python_bindings.cpp index 95a1fbd5..fed5adbf 100644 --- a/pybuda/csrc/graph_lib/python_bindings.cpp +++ b/pybuda/csrc/graph_lib/python_bindings.cpp @@ -1001,6 +1001,7 @@ py::object consteval_input( input_value = narrow_buda_tensor_to_pytorch(input_value, node->shape().as_vector()); } node_outputs.insert({node->id(), {input_value}}); + output = input_value; continue; } diff --git a/pybuda/csrc/graph_lib/utils.cpp b/pybuda/csrc/graph_lib/utils.cpp index df0b8d05..49c9903a 100644 --- a/pybuda/csrc/graph_lib/utils.cpp +++ b/pybuda/csrc/graph_lib/utils.cpp @@ -516,6 +516,80 @@ std::vector topological_sort(const Graph &graph, std::functiondata_users(node).size() > 1, "Node only has one user, do not fork."); + + // If the node passed is an input node then just fork it + graphlib::InputNode *input = dynamic_cast(node); + graphlib::OpNode *op = dynamic_cast(node); + if (input) { + input->get_consteval_graph(graph, true, true); // create graph before clone so input node name is correct + std::vector user_edges = graph->user_data_edges(input); + TT_ASSERT(graph->data_operands(input).size() == 0, "Input can't have operands"); + for (int i = 1; i < (int)user_edges.size(); i++) + { + graphlib::Edge const &user_edge = user_edges[i]; + log_trace( + LogConstEval, + "fork_subgraph: cloning: {} -> {}", + input->name(), + graph->node_by_id(user_edge.consumer_node_id)->name()); + + std::string clone_name = input->name() + "_subgraph_fork_clone_" + std::to_string(user_edge.edge_creation_id); + Node *clone = graph->add_node( + input->clone(clone_name), + graph->get_subgraph_id_for_node(input->id())); + + auto attr = graph->get_edge_attributes(user_edge); + graph->remove_edge(user_edge); + // Replace user operand_edge + Edge new_user_edge = Edge(clone->id(), user_edge.producer_output_port_id, user_edge.consumer_node_id, user_edge.consumer_input_port_id, user_edge.edge_type); + + graph->add_edge(new_user_edge, attr); + } + } + 
else if (op) { + std::vector user_edges = graph->user_data_edges(op); + std::vector operand_edges = graph->operand_data_edges(op); + + // Clone this op once for every user + for (int i = 1; i < (int)user_edges.size(); i++) + { + graphlib::Edge const &user_edge = user_edges[i]; + log_trace( + LogConstEval, + "fork_subgraph: cloning: {} -> {}", + op->name(), + graph->node_by_id(user_edge.consumer_node_id)->name()); + + std::string clone_name = op->name() + "_subgraph_fork_clone_" + std::to_string(user_edge.edge_creation_id); + Node *clone_op = graph->add_node(op->clone(clone_name), graph->get_subgraph_id_for_node(op->id())); + + // Copy all the operand edges + for (int j = 0; j < (int)operand_edges.size(); j++) { + Edge operand_edge = operand_edges[j]; + Edge new_edge = Edge(operand_edge.producer_node_id, operand_edge.producer_output_port_id, clone_op->id(), operand_edge.consumer_input_port_id, operand_edge.edge_type); + graph->add_edge(new_edge, graph->get_edge_attributes(operand_edge)); + } + + // Replace user operand_edge + Edge new_user_edge = Edge(clone_op->id(), i, user_edge.consumer_node_id, user_edge.consumer_input_port_id, user_edge.edge_type); + + graph->add_edge(new_user_edge, graph->get_edge_attributes(user_edge)); + graph->remove_edge(user_edge); + } + + // Fork the graph of each operand + for (auto operand_edge : graph->operand_data_edges(op)) { + fork_subgraph(graph, graph->node_by_id(operand_edge.producer_node_id)); + } + + } + else { + TT_ASSERT(false, "The node passed must be an InputNode or OpNode"); + } +} + std::vector visible_nodes(Graph const &graph, std::function node_filter) { std::vector result; diff --git a/pybuda/csrc/graph_lib/utils.hpp b/pybuda/csrc/graph_lib/utils.hpp index 8eef3265..bf70b166 100644 --- a/pybuda/csrc/graph_lib/utils.hpp +++ b/pybuda/csrc/graph_lib/utils.hpp @@ -61,6 +61,8 @@ TileDim get_tile_dim_from_height_width(int tile_height, int tile_width); std::vector topological_sort( Graph const &graph, std::function node_filter = default_node_filter, bool unroll_loops = false); +void fork_subgraph(Graph *graph, Node *node); + std::vector> topological_generations(const Graph &graph); // Returns vector of all visible nodes in the graph. 
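The `fork_subgraph` helper declared above clones a node once per extra user (and, for op nodes, recursively forks their operand subgraphs) so that every consumer ends up with a private copy. A minimal Python sketch of the forking idea on a toy adjacency-list DAG; the function and data layout here are illustrative only, not the PyBuda C++ API:

# Toy DAG: `users` maps node -> consumers, `operands` maps node -> producers.
def fork_node(users, operands, node):
    assert len(users[node]) > 1, "Node only has one user, do not fork."
    extra_users, users[node] = users[node][1:], users[node][:1]
    for i, user in enumerate(extra_users):
        clone = f"{node}_subgraph_fork_clone_{i}"
        # The clone keeps the original's operands...
        operands[clone] = list(operands[node])
        for producer in operands[clone]:
            users[producer].append(clone)
        # ...and takes over exactly one of the original's users.
        users[clone] = [user]
        operands[user] = [clone if p == node else p for p in operands[user]]

# A scale constant feeding both a quantize and a dequantize gets forked:
users = {"scale": ["quantize", "dequantize"], "quantize": [], "dequantize": []}
operands = {"scale": [], "quantize": ["scale"], "dequantize": ["scale"]}
fork_node(users, operands, "scale")
assert operands["dequantize"] == ["scale_subgraph_fork_clone_0"]

This mirrors why the `fork_quantization_scales` pass added later in this patch can then treat each quantize/dequantize scale independently.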
diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index 338b0a90..69d6e067 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -209,7 +209,7 @@ bool are_compatible_ops(graphlib::Graph *graph, graphlib::OpNode *a, graphlib::O bool is_inverse = are_compatible_tms & (operand_shape == shape_to_check_on_b); auto operand_edges = graph->operand_data_edges(b); is_inverse |= are_inverse_with_broadcast(operand_shape, shape_to_check_on_b, total_broadcast_volume(graph, operand_edges[0])).first; - is_inverse &= not b->as()->has_tag("dont_erase"); + is_inverse &= not b->as()->tag_value_or("dont_erase", false); if (not is_inverse) return false; @@ -553,7 +553,7 @@ bool commute_through_reduce( std::pair *operand_dims, graphlib::OpType *golden_transform, bool commute_up) -{ +{ TT_ASSERT(op->op_attrs().size() == 1); int reduce_dim = std::get(op->op_attrs()[0]); @@ -561,6 +561,7 @@ bool commute_through_reduce( if (reduce_dim < 0) reduce_dim += op->shape().size(); + int original_reduce_dim = reduce_dim; // Check to see if this op has a user that is the same kind of reduce bool can_commute = false; @@ -574,114 +575,146 @@ bool commute_through_reduce( prev_nodes = op_users; } - for (graphlib::Node* next_node : next_nodes) { - graphlib::OpNode *next_op = dynamic_cast(next_node); - if (next_op == nullptr) - continue; - - // Check if the next op is a reduce, and the same type of reduce - if (next_op->op_name() != op->op_name()) - continue; - - auto compare_shape = check_only ? graph->data_operands(op)[0]->shape() : *clone_shape; + if (not commute_up and initial_op->op_name() == "transpose") { + int dim0 = initial_op->op_type().get_attr_as("dim0"); + int dim1 = initial_op->op_type().get_attr_as("dim1"); - int next_reduce_dim = std::get(next_op->op_attrs()[0]); - // Convert to positive indexing - if (next_reduce_dim < 0) - next_reduce_dim += next_op->shape().size(); + if (dim0 < 0) + dim0 += initial_op->shape().size(); - int min_reduce_dim = std::min(reduce_dim, next_reduce_dim); - int max_reduce_dim = std::max(reduce_dim, next_reduce_dim); - int commute_max_reduce_dim = max_reduce_dim - (op->shape().size() - commute_shape->size()); // Adjust for commute shape + if (dim1 < 0) + dim1 += initial_op->shape().size(); - // This avoids the case where the reshape unflattens y into z. 
i.e (1, 1, 64, 4096) -> (1, 32, 2, 4096) - if (not commute_up) + if (dim0 == reduce_dim) { - if ((*commute_shape)[commute_max_reduce_dim] != compare_shape[min_reduce_dim] * compare_shape[max_reduce_dim]) { - TT_ASSERT(check_only, "Cannot perform commute if commute is not possible"); - can_commute = false; - break; - } + reduce_dim = dim1; } - else { - if ((*commute_shape)[commute_max_reduce_dim] != 1) { - TT_ASSERT(check_only, "Cannot perform commute if commute is not possible"); - can_commute = false; - break; - } + else if (dim1 == reduce_dim) { + reduce_dim = dim0; } - // If the next op is the same kind of reduce, and the reduce dim is one off, skip, next op we handle this case - if (next_reduce_dim == reduce_dim+1 or next_reduce_dim == reduce_dim-1) { + + if (reduce_dim < (int)op->shape().size()) { can_commute = true; - break; + auto reduce_shape = op->shape(); + *clone_shape = reduce_shape; + reduce_shape[reduce_dim] = op->shape()[original_reduce_dim]; + reduce_shape[original_reduce_dim] = op->shape()[reduce_dim]; + *commute_shape = reduce_shape; } + } + else { - // Check to see if previous op is reduce - for(graphlib::Node* prev_node : prev_nodes) - { - graphlib::OpNode *prev_op = dynamic_cast(prev_node); - if (prev_op == nullptr) - continue; + + for (graphlib::Node* next_node : next_nodes) { + graphlib::OpNode *next_op = dynamic_cast(next_node); + if (next_op == nullptr) + continue; - if (prev_op->op_name() == op->op_name()) - { - TT_ASSERT(prev_op->op_attrs().size() == 1); - int prev_reduce_dim = std::get(prev_op->op_attrs()[0]); + // Check if the next op is a reduce, and the same type of reduce + if (next_op->op_name() != op->op_name()) + continue; + + auto compare_shape = check_only ? graph->data_operands(op)[0]->shape() : *clone_shape; + + int next_reduce_dim = std::get(next_op->op_attrs()[0]); // Convert to positive indexing - if (prev_reduce_dim < 0) - prev_reduce_dim += op->shape().size(); + if (next_reduce_dim < 0) + next_reduce_dim += next_op->shape().size(); + + int min_reduce_dim = std::min(reduce_dim, next_reduce_dim); + int max_reduce_dim = std::max(reduce_dim, next_reduce_dim); + int commute_max_reduce_dim = max_reduce_dim - (op->shape().size() - commute_shape->size()); // Adjust for commute shape - // If the previous op is the same kind of reduce, and the reduce dim is one off, then we can determine the commute shape after both ops - if (prev_reduce_dim == reduce_dim+1 or prev_reduce_dim == reduce_dim-1) + // This avoids the case where the reshape unflattens y into z. 
i.e (1, 1, 64, 4096) -> (1, 32, 2, 4096) + if (not commute_up) { - auto commute_dim = (uint32_t) std::max(prev_reduce_dim, reduce_dim); - auto commute_vec = commute_shape->as_vector(); - while (commute_dim >= commute_vec.size()) - commute_vec.push_back(1); - *commute_shape = graphlib::Shape::create(commute_vec); - if (commute_up) - (*commute_shape)[commute_dim] = producer->shape()[reduce_dim] * producer->shape()[prev_reduce_dim]; - else - (*commute_shape)[commute_dim] = 1; - if (clone_shape != nullptr) { - (*clone_shape)[reduce_dim] = 1; - (*clone_shape)[prev_reduce_dim] = 1; + if ((*commute_shape)[commute_max_reduce_dim] != compare_shape[min_reduce_dim] * compare_shape[max_reduce_dim]) { + TT_ASSERT(check_only, "Cannot perform commute if commute is not possible"); + can_commute = false; + break; } + } + else { + if ((*commute_shape)[commute_max_reduce_dim] != 1) { + TT_ASSERT(check_only, "Cannot perform commute if commute is not possible"); + can_commute = false; + break; + } + } + // If the next op is the same kind of reduce, and the reduce dim is one off, skip, next op we handle this case + if (next_reduce_dim == reduce_dim+1 or next_reduce_dim == reduce_dim-1) { can_commute = true; + break; } } - } - - if (not can_commute) - { - auto [can_commute, new_dim] = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); - if (can_commute) + + // Check to see if previous op is reduce + for(graphlib::Node* prev_node : prev_nodes) { - graphlib::Shape updated_commute_shape = *commute_shape; - if (producer) - { - TT_ASSERT(commute_up, "Should only be using producer for shape if commuting up"); - updated_commute_shape[new_dim] = producer->shape().as_vector()[reduce_dim]; - } - else + graphlib::OpNode *prev_op = dynamic_cast(prev_node); + if (prev_op == nullptr) + continue; + + if (prev_op->op_name() == op->op_name()) { - updated_commute_shape[new_dim] = op->shape().as_vector()[reduce_dim]; + TT_ASSERT(prev_op->op_attrs().size() == 1); + int prev_reduce_dim = std::get(prev_op->op_attrs()[0]); + // Convert to positive indexing + if (prev_reduce_dim < 0) + prev_reduce_dim += op->shape().size(); + + // If the previous op is the same kind of reduce, and the reduce dim is one off, then we can determine the commute shape after both ops + if (prev_reduce_dim == reduce_dim+1 or prev_reduce_dim == reduce_dim-1) + { + auto commute_dim = (uint32_t) std::max(prev_reduce_dim, reduce_dim); + auto commute_vec = commute_shape->as_vector(); + while (commute_dim >= commute_vec.size()) + commute_vec.push_back(1); + *commute_shape = graphlib::Shape::create(commute_vec); + if (commute_up) + (*commute_shape)[commute_dim] = producer->shape()[reduce_dim] * producer->shape()[prev_reduce_dim]; + else + (*commute_shape)[commute_dim] = 1; + if (clone_shape != nullptr) { + (*clone_shape)[reduce_dim] = 1; + (*clone_shape)[prev_reduce_dim] = 1; + } + can_commute = true; + } } - *commute_shape = updated_commute_shape; - if (clone_shape != nullptr) + } + + if (not can_commute) + { + auto [can_commute, new_dim] = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); + if (can_commute) { - graphlib::Shape updated_clone_shape = *clone_shape; + graphlib::Shape updated_commute_shape = *commute_shape; if (producer) { TT_ASSERT(commute_up, "Should only be using producer for shape if commuting up"); - updated_clone_shape[reduce_dim] = producer->shape().as_vector()[reduce_dim]; + updated_commute_shape[new_dim] = producer->shape().as_vector()[reduce_dim]; } else { - updated_clone_shape[reduce_dim] = 
op->shape().as_vector()[reduce_dim]; + updated_commute_shape[new_dim] = op->shape().as_vector()[reduce_dim]; + } + *commute_shape = updated_commute_shape; + if (clone_shape != nullptr) + { + graphlib::Shape updated_clone_shape = *clone_shape; + if (producer) + { + TT_ASSERT(commute_up, "Should only be using producer for shape if commuting up"); + updated_clone_shape[reduce_dim] = producer->shape().as_vector()[reduce_dim]; + } + else + { + updated_clone_shape[reduce_dim] = op->shape().as_vector()[reduce_dim]; + } + *clone_shape = updated_clone_shape; } - *clone_shape = updated_clone_shape; } } } @@ -696,6 +729,22 @@ bool commute_through_reduce( TT_ASSERT(next, "next must be set"); TT_ASSERT(not commute_up, "Cannot perform commute upwards"); + if (not commute_up and initial_op->op_name() == "transpose") { + auto op_attr = op->op_attrs(); + auto reduce_shape = op->shape(); + + *clone_shape = reduce_shape; + reduce_shape[reduce_dim] = op->shape()[original_reduce_dim]; + reduce_shape[original_reduce_dim] = op->shape()[reduce_dim]; + *commute_shape = reduce_shape; + + op_attr[0] = reduce_dim; + op->add_golden_transform(*golden_transform); + op->overwrite_op_attrs(op_attr); + op->set_shape(reduce_shape); + return true; + } + if (graphlib::OpNode *next_as_op = dynamic_cast(next)) { if (op->op_name() == next_as_op->op_name()) { return true; @@ -859,23 +908,31 @@ bool commute_through_quantization( bool commute_up) { TT_ASSERT(is_quantization_ops(op), "op must be an quantization op"); - if (commute_up) - return false; - + (void)commute_up; // Avoid compiler warning. int axis = std::get(op->op_attrs()[1]); int new_axis = axis; bool can_commute = false; if (initial_op->op_type().op == "reshape") { + + // axis of quantization must have the same volume to the left and right of it + if (new_axis < 0) + new_axis += op->shape().size(); - if (not commute_up) { - // axis of quantization must have the same volume to the left and right of it - - if (new_axis < 0) - new_axis += op->shape().size(); - - // check if axis moved to the right (or in the same place) - while (new_axis < (int)commute_shape->size()) { + // check if axis moved to the right (or in the same place) + while (new_axis < (int)commute_shape->size()) { + if ((*commute_shape)[new_axis] == op->shape()[axis]) { + if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) + and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { + can_commute = true; + } + break; + } + new_axis++; + } + if (not can_commute) { + new_axis = axis-1; + while (new_axis >= 0) { if ((*commute_shape)[new_axis] == op->shape()[axis]) { if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { @@ -883,22 +940,9 @@ bool commute_through_quantization( } break; } - new_axis++; + new_axis--; } - if (not can_commute) { - new_axis = axis-1; - while (new_axis >= 0) { - if ((*commute_shape)[new_axis] == op->shape()[axis]) { - if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) - and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { - can_commute = true; - } - break; - } - new_axis--; - } - } - } + } } else if (initial_op->op_type().op == "transpose") { @@ -951,7 +995,8 @@ bool is_elementwise(graphlib::OpNode *op) bool 
is_quantization_ops(graphlib::OpNode *op)
{
-    return op->op_name() == "buda_quantize" or op->op_name() == "buda_dequantize" or op->op_name() == "buda_requantize";
+    return op->op_name() == "buda_quantize" or op->op_name() == "buda_dequantize" or op->op_name() == "buda_requantize"
+        or op->op_name() == "quantize" or op->op_name() == "dequantize" or op->op_name() == "requantize";
 }
diff --git a/pybuda/csrc/passes/consteval.cpp b/pybuda/csrc/passes/consteval.cpp
index b030293e..98908962 100644
--- a/pybuda/csrc/passes/consteval.cpp
+++ b/pybuda/csrc/passes/consteval.cpp
@@ -116,10 +116,22 @@ static std::vector consteval_input(graphlib::Graph *graph, gra
     graphlib::Node *user = users[0];
     log_debug(LogConstEval, "Promoting node - Graph: {} - Node: {}", input->name(), user->name());
-    std::vector removed_operands = graph->data_operands(user);
-    auto iter = std::find(removed_operands.begin(), removed_operands.end(), input);
-    TT_ASSERT(iter != removed_operands.end());
-    removed_operands.erase(iter); // every operand except for `input` is removed in `promote_node`
+    std::vector user_other_operands = graph->data_operands(user);
+
+    auto iter = std::find(user_other_operands.begin(), user_other_operands.end(), input);
+    TT_ASSERT(iter != user_other_operands.end());
+    user_other_operands.erase(iter); // every operand except for `input` is removed in `promote_node`
+
+    // The other user operands are essentially cloned if they themselves have multiple users.
+    // One clone will go in the consteval graph of the passed <*input>, and the other
+    // will remain in the main graph. We do not want to include such an operand in removed_operands,
+    // since in that case it won't actually have been removed from the graph.
+    std::vector removed_operands;
+    for (graphlib::Node * user_operand : user_other_operands) {
+        if (graph->data_users(user_operand).size() == 1)
+            removed_operands.push_back(user_operand);
+    }
+
     graphlib::ConstEvalGraph *consteval_graph = input->get_consteval_graph(graph, true, true);
     consteval_graph->promote_node(graph, user);
diff --git a/pybuda/csrc/passes/erase_consecutive_reshape.cpp b/pybuda/csrc/passes/erase_consecutive_reshape.cpp
index ae5f94a2..92c01a56 100644
--- a/pybuda/csrc/passes/erase_consecutive_reshape.cpp
+++ b/pybuda/csrc/passes/erase_consecutive_reshape.cpp
@@ -139,7 +139,7 @@ static void commute_eltwise_ops(graphlib::Graph *graph, std::vectorname() + "_operand_commute_clone" + std::to_string(another_operand_edge.edge_creation_id);
     graphlib::Node *clone = graph->add_node(last->clone(name), graph->get_subgraph_id_for_node(last->id()));
     graphlib::OpNode *added_op = dynamic_cast(clone);
-    added_op->as()->tag("dont_erase");
+    added_op->as()->tag("dont_erase", true);
     log_trace(LogGraphCompiler, " Operand commute clone: {} -> between {} and {} ", name, added_op->name(), graph->node_by_id(another_operand_edge.producer_node_id)->name());
     update_reshape_attr(added_op, commute_shape);
@@ -211,7 +211,7 @@ bool erase_consecutive_reshape(graphlib::Graph *graph, bool commute_eltwise)
     if (user_edges.size() > 1 or user_edges.empty())
         continue;
-    if (node->as()->has_tag("dont_erase"))
+    if (node->as()->tag_value_or("dont_erase", false))
         continue;
     // TODO: relax this, but it causes a lot of edge cases
diff --git a/pybuda/csrc/passes/erase_inverse_ops.cpp b/pybuda/csrc/passes/erase_inverse_ops.cpp
index 731b0acb..ceaf9746 100644
--- a/pybuda/csrc/passes/erase_inverse_ops.cpp
+++ b/pybuda/csrc/passes/erase_inverse_ops.cpp
@@ -387,7 +387,7 @@ bool erase_inverse_ops(graphlib::Graph *graph)
        if (not op)
            continue;

-        if (op->as()->has_tag("dont_erase"))
+        if (op->as()->tag_value_or("dont_erase", false))
            continue;

        if (match_fns.find(op->op_name()) == match_fns.end())
diff --git a/pybuda/csrc/passes/fork_quantization_scales.cpp b/pybuda/csrc/passes/fork_quantization_scales.cpp
new file mode 100644
index 00000000..4113e339
--- /dev/null
+++ b/pybuda/csrc/passes/fork_quantization_scales.cpp
@@ -0,0 +1,60 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "passes/fork_quantization_scales.hpp"
+
+#include 
+
+#include "graph_lib/node_types.hpp"
+#include "graph_lib/utils.hpp"
+#include "utils/logger.hpp"
+#include "passes/passes_utils.hpp"
+#include "passes/commute_utils.hpp"
+#include "reportify/reportify.hpp"
+
+namespace tt::passes {
+
+bool fork_quantization_scales(graphlib::Graph *graph) {
+    bool updated_anything = false;
+    bool attempt_update = true;
+    while(attempt_update) {
+        attempt_update = false;
+        for (auto *node : graphlib::topological_sort(*graph))
+        {
+            graphlib::OpNode *op = dynamic_cast(node);
+            if (not op)
+                continue;
+
+            if (not is_quantization_ops(op))
+                continue;
+
+            graphlib::Node *scale = graph->data_operands(op)[1];
+
+            std::vector ancestors_to_check{scale};
+
+            while (ancestors_to_check.size() > 0) {
+                scale = ancestors_to_check.back();
+                ancestors_to_check.pop_back();
+                if (graph->data_users(scale).size() > 1) {
+                    ancestors_to_check.clear();
+                    break;
+                }
+
+                for (graphlib::Node *operand : graph->data_operands(scale)) {
+                    ancestors_to_check.push_back(operand);
+                }
+            }
+
+            if (graph->data_users(scale).size() > 1) {
+                fork_subgraph(graph, scale);
+                attempt_update = true;
+                updated_anything = true;
+                break;
+            }
+
+        }
+    }
+    return updated_anything;
+}
+
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/fork_quantization_scales.hpp b/pybuda/csrc/passes/fork_quantization_scales.hpp
new file mode 100644
index 00000000..cbfc01ea
--- /dev/null
+++ b/pybuda/csrc/passes/fork_quantization_scales.hpp
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace tt::graphlib
+{
+class Graph;
+class OpNode;
+class Shape;
+}
+
+namespace tt::passes
+{
+// Forks shared quantization-scale subgraphs so each consuming quantize/dequantize/requantize op gets its own copy; returns true if the graph was changed
+bool fork_quantization_scales(graphlib::Graph *graph);
+}
\ No newline at end of file
diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp
index 83de8006..0171519d 100644
--- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp
+++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp
@@ -195,7 +195,7 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) {
        if (not op)
            continue;
-        if (op->as()->has_tag("dont_erase"))
+        if (op->as()->tag_value_or("dont_erase", false))
            continue;
        if (not is_tm(op->op_type()).cast())
diff --git a/pybuda/csrc/passes/insert_inverse_on_io.cpp b/pybuda/csrc/passes/insert_inverse_on_io.cpp
index ff4fd84e..bada13cd 100644
--- a/pybuda/csrc/passes/insert_inverse_on_io.cpp
+++ b/pybuda/csrc/passes/insert_inverse_on_io.cpp
@@ -32,7 +32,7 @@ void add_inverse_to_input_edges(
    auto name = initial_op->name() + "_input_commute_clone" + std::to_string(edge.edge_creation_id);
    auto *clone_0 = graph->add_node(initial_op->clone(name), graph->get_subgraph_id_for_node(initial_op->id()));
    graphlib::OpNode *clone_0_op = dynamic_cast(clone_0);
-    clone_0_op->as()->tag("dont_erase");
+    clone_0_op->as()->tag("dont_erase", true);
update_reshape_attr(clone_0_op, frist_reshape); clone_0->set_shape(frist_reshape); log_trace(LogGraphCompiler, " Input commute clone 0: {} set to shape: {}", name, frist_reshape); @@ -95,7 +95,7 @@ void add_inverse_to_output_edge( name = initial_op->name() + "_output_commute_clone" + std::to_string(outgoing_edge.edge_creation_id); auto *clone_1 = graph->add_node(initial_op->clone(name), graph->get_subgraph_id_for_node(initial_op->id())); graphlib::OpNode *clone_1_op = dynamic_cast(clone_1); - clone_1_op->as()->tag("dont_erase"); + clone_1_op->as()->tag("dont_erase", true); update_reshape_attr(clone_1_op, clone_shape); clone_1->set_shape(clone_shape); log_trace(LogGraphCompiler, " Output commute clone 1: {}", name); @@ -369,7 +369,7 @@ bool insert_inverse_on_inputs(graphlib::Graph *graph) if (not op) continue; - if (op->as()->has_tag("dont_erase")) + if (op->as()->tag_value_or("dont_erase", false)) continue; if (op->op_name() != "reshape" and op->op_name() != "transpose") @@ -402,7 +402,7 @@ bool insert_inverse_on_outputs(graphlib::Graph *graph) if (not op) continue; - if (op->as()->has_tag("dont_erase")) + if (op->as()->tag_value_or("dont_erase", false)) continue; if (op->op_name() != "reshape" and op->op_name() != "transpose") @@ -429,7 +429,7 @@ bool insert_inverse_on_downstream_tms(graphlib::Graph *graph) { if (not op) continue; - if (op->as()->has_tag("dont_erase")) + if (op->as()->tag_value_or("dont_erase", false)) continue; if (op->op_name() != "reshape") diff --git a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp index 349e4e0b..c3acf49c 100644 --- a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp +++ b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp @@ -30,8 +30,8 @@ bool is_op_in_quantized_region(graphlib::OpNode *op) return std::find(int_types.begin(), int_types.end(), op->output_df()) != int_types.end(); } -static std::vector find_downward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op) { - std::vector path; +static std::tuple, graphlib::Shape, graphlib::Shape> find_downward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op) { + std::vector users_outside; graphlib::OpNode *iter = initial_op; @@ -45,12 +45,11 @@ static std::vector find_downward_path_out(graphlib::Graph *gra // For now if there are multiple children then dont commute std::vector user_edges = graph->user_data_edges(op); - if (user_edges.size() > 1) + if (user_edges.size() > 1 and op->op_name() != "buda_dequantize") break; graphlib::Edge user_edge = user_edges[0]; - // For now, if there are any edge tms just dont commute if (op != initial_op) { std::vector tms = graph->get_edge_attributes(user_edge)->get_tms(); @@ -64,9 +63,12 @@ static std::vector find_downward_path_out(graphlib::Graph *gra if (not can_commute and op != initial_op) { break; } - path.push_back(op); - if (is_quantization_ops(op)) + + if (op->op_name() == "buda_dequantize") { found_dequantize = true; + for (graphlib::Edge user_edge : user_edges) + users_outside.push_back(user_edge); + } iter = dynamic_cast(graph->node_by_id(user_edge.consumer_node_id)); if (not iter) @@ -74,12 +76,65 @@ static std::vector find_downward_path_out(graphlib::Graph *gra } if (not found_dequantize) - path.clear(); + users_outside.clear(); - return path; + return std::make_tuple(users_outside, commute_shape, clone_shape); } -void insert_inverse_pair_below(graphlib::Graph *graph, graphlib::OpNode *transpose_op, std::vector edges) { +static 
std::tuple, graphlib::Shape, graphlib::Shape> find_upward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op) { + std::vector operands_outside; + + graphlib::OpNode *iter = initial_op; + + auto clone_shape = initial_op->shape(); + auto commute_shape = shape_of_only_operand(graph, initial_op); + + bool found_quantize = false; + while (not found_quantize) { + graphlib::OpNode *op = dynamic_cast(iter); + TT_ASSERT(op); + + // For now if there are multiple children then dont commute + std::vector operand_edges = graph->operand_data_edges(op); + if (operand_edges.size() > 1 and op->op_name() != "buda_quantize") + break; + + graphlib::Edge operand_edge = operand_edges[0]; + + // For now, if there are any edge tms just dont commute + if (op != initial_op) { + std::vector tms = graph->get_edge_attributes(operand_edge)->get_tms(); + if (tms.size() > 0) { + break; + } + } + + bool can_commute = can_commute_past_op(op, initial_op, graph, &commute_shape, &clone_shape, true); + if (not can_commute and op != initial_op) { + break; + } + + if (op->op_name() == "buda_quantize") { + found_quantize = true; + for (graphlib::Edge operand_edge : operand_edges) { + // If the operand of this edge is already an inverse to this op, dont bother returning the edge + graphlib::OpNode *operand = dynamic_cast(graph->node_by_id(operand_edge.producer_node_id)); + if (operand and not are_compatible_ops(graph, initial_op, operand, &commute_shape)) + operands_outside.push_back(operand_edge); + } + } + iter = dynamic_cast(graph->node_by_id(operand_edge.producer_node_id)); + if (not iter) + break; + } + + if (not found_quantize) + operands_outside.clear(); + + return std::make_tuple(operands_outside, commute_shape, clone_shape); +} + +void insert_inverse_transpose_pair(graphlib::Graph *graph, graphlib::OpNode *transpose_op, std::vector edges, bool below) { const graphlib::OpType orig_op_type = transpose_op->op_type(); @@ -96,6 +151,8 @@ void insert_inverse_pair_below(graphlib::Graph *graph, graphlib::OpNode *transpo clone_inverse_op->op_type().set_attr("z_dim_slice", orig_op_type.get_attr("z_dim_slice")); auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, edge, clone_inverse_op); clone_inverse_op->set_output_df_from_operands(graph); + if (not below) + clone_inverse_op->tag("dont_erase", true); graphlib::Shape clone_inverse_shape = operand->shape(); clone_inverse_shape[orig_op_type.get_attr_as("dim0")] = operand->shape()[orig_op_type.get_attr_as("dim1")]; clone_inverse_shape[orig_op_type.get_attr_as("dim1")] = operand->shape()[orig_op_type.get_attr_as("dim0")]; @@ -111,10 +168,50 @@ void insert_inverse_pair_below(graphlib::Graph *graph, graphlib::OpNode *transpo clone_op->set_output_df_from_operands(graph); graphlib::Shape clone_shape = operand->shape(); clone_op->set_shape(clone_shape); + if (below) + clone_op->tag("dont_erase", "true"); } } +void insert_inverse_reshape_pair(graphlib::Graph *graph, graphlib::OpNode *reshape_op, std::vector edges, graphlib::Shape commute_shape, graphlib::Shape clone_shape, bool below) { + const graphlib::OpType orig_op_type = reshape_op->op_type(); + + for (graphlib::Edge edge : edges) { + + const std::string inverse_name = reshape_op->name() + "_quant_remove_clone" + std::to_string(edge.edge_creation_id); + auto *clone_inverse = graph->add_node(reshape_op->clone(inverse_name), graph->get_subgraph_id_for_node(edge.consumer_node_id)); + graphlib::OpNode *clone_inverse_op = dynamic_cast(clone_inverse); + clone_inverse_op->set_shape(commute_shape); + if (not below) + 
clone_inverse_op->tag("dont_erase", true); + update_reshape_attr(clone_inverse_op, commute_shape); + auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, edge, clone_inverse_op); + clone_inverse_op->set_output_df_from_operands(graph); + + + const std::string clone_name = reshape_op->name() + "_quant_remove_clone" + std::to_string(outgoing_edge.edge_creation_id); + graphlib::Node* clone = graph->add_node(reshape_op->clone(clone_name), graph->get_subgraph_id_for_node(edge.consumer_node_id)); + graphlib::OpNode *clone_op = dynamic_cast(clone); + clone_op->set_shape(clone_shape); + update_reshape_attr(clone_op, clone_shape); + insert_node_on_edge(graph, outgoing_edge, clone_op); + clone_op->set_output_df_from_operands(graph); + if (below) + clone_op->tag("dont_erase", true); + } +} + +void insert_inverse_pair(graphlib::Graph *graph, graphlib::OpNode *op, std::vector edges, graphlib::Shape commute_shape, graphlib::Shape clone_shape, bool below) { + if (op->op_name() == "transpose") + insert_inverse_transpose_pair(graph, op, edges, below); + else if (op->op_name() == "reshape") + insert_inverse_reshape_pair(graph, op, edges, commute_shape, clone_shape, below); + else { + TT_ASSERT(false, "Invalid Op passed"); + } +} + bool insert_inverse_outside_quantized_region(graphlib::Graph *graph) { bool updated_anything = false; @@ -132,7 +229,7 @@ bool insert_inverse_outside_quantized_region(graphlib::Graph *graph) if (not op) continue; - if (op->op_name() != "transpose") + if (op->op_name() != "transpose" and op->op_name() != "reshape") continue; if (not is_op_in_quantized_region(op)) @@ -140,18 +237,39 @@ bool insert_inverse_outside_quantized_region(graphlib::Graph *graph) if (std::find(ops_already_checked.begin(), ops_already_checked.end(), op) != ops_already_checked.end()) continue; + + auto user_out_data = find_downward_path_out(graph, op); + std::vector user_edges = std::get<0>(user_out_data); + graphlib::Shape commute_shape = std::get<1>(user_out_data); + graphlib::Shape clone_shape = std::get<2>(user_out_data); - std::vector downward_path = find_downward_path_out(graph, op); - - if (not downward_path.empty()) { + if (not user_edges.empty()) { // Insert inverse pair on all outgoing edges of last node in downward path - graphlib::Node *last_node = downward_path.back(); - insert_inverse_pair_below(graph, op, graph->user_data_edges(last_node)); + op->tag("dont_erase", false); + + insert_inverse_pair(graph, op, user_edges, commute_shape, clone_shape, true); ops_already_checked.push_back(op); updated_anything = true; attempt_update = true; break; } + else { + auto operand_out_data = find_upward_path_out(graph, op); + std::vector operand_edges = std::get<0>(operand_out_data); + graphlib::Shape commute_shape = std::get<1>(operand_out_data); + graphlib::Shape clone_shape = std::get<2>(operand_out_data); + + if (not operand_edges.empty()) { + // Insert inverse pair on all outgoing edges of last node in downward path + op->tag("dont_erase", false); + + insert_inverse_pair(graph, op, operand_edges, commute_shape, clone_shape, false); + ops_already_checked.push_back(op); + updated_anything = true; + attempt_update = true; + break; + } + } } } diff --git a/pybuda/csrc/passes/make_quantized_ops.cpp b/pybuda/csrc/passes/make_quantized_ops.cpp index c5be72c5..4232b78d 100644 --- a/pybuda/csrc/passes/make_quantized_ops.cpp +++ b/pybuda/csrc/passes/make_quantized_ops.cpp @@ -82,6 +82,10 @@ bool is_quantizeable_conv2d(graphlib::Graph *graph, graphlib::Node *conv2d) { if (operand_op->op_type().op != 
"dequantize") return false; } + + // If three is no bias then it is quantizeable, since we already know the act and weight are dequantize ops + if (graph->data_operands(conv2d).size() == 2) + return true; // The scale of the bias dequant must be equal to the product of the scales of the act and weight graphlib::OpNode *deq_act = dynamic_cast(graph->data_operands(conv2d)[0]); @@ -227,11 +231,15 @@ void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { graphlib::OpNode *deq_act = dynamic_cast(graph->data_operands(conv2d)[0]); graphlib::OpNode *deq_weight = dynamic_cast(graph->data_operands(conv2d)[1]); - graphlib::OpNode *deq_bias = dynamic_cast(graph->data_operands(conv2d)[2]); + graphlib::OpNode *deq_bias = nullptr; + if (graph->data_operands(conv2d).size() == 3) + deq_bias = dynamic_cast(graph->data_operands(conv2d)[2]); graphlib::Node *deq_act_scale = graph->data_operands(deq_act)[1]; graphlib::Node *deq_weight_scale = graph->data_operands(deq_weight)[1]; - graphlib::Node *deq_bias_scale = graph->data_operands(deq_bias)[1]; + graphlib::Node *deq_bias_scale = nullptr; + if (graph->data_operands(conv2d).size() == 3) + deq_bias_scale = graph->data_operands(deq_bias)[1]; // We convert the dequant axis to to a negative index because the conv // shape size might be larger than the shape of deq1 @@ -293,14 +301,16 @@ void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { // Remove scale edges so that bypass node works (it requires that the node has one operand) graphlib::Edge old_deq_act_scale_edge = retrieve_between_edge(graph, deq_act_scale, deq_act); graphlib::Edge old_deq_weight_scale_edge = retrieve_between_edge(graph, deq_weight_scale, deq_weight); - graphlib::Edge old_deq_bias_scale_edge = retrieve_between_edge(graph, deq_bias_scale, deq_bias); graph->remove_edge(old_deq_act_scale_edge); graph->remove_edge(old_deq_weight_scale_edge); - graph->remove_edge(old_deq_bias_scale_edge); - + if (deq_bias) { + graphlib::Edge old_deq_bias_scale_edge = retrieve_between_edge(graph, deq_bias_scale, deq_bias); + graph->remove_edge(old_deq_bias_scale_edge); + } bypass_node(graph, deq_act, true); bypass_node(graph, deq_weight, true); - bypass_node(graph, deq_bias, true); + if (deq_bias) + bypass_node(graph, deq_bias, true); conv2d->set_output_df(DataFormat::Int32); } diff --git a/pybuda/csrc/passes/move_requantize.cpp b/pybuda/csrc/passes/move_requantize.cpp index dc8c3337..4972c489 100644 --- a/pybuda/csrc/passes/move_requantize.cpp +++ b/pybuda/csrc/passes/move_requantize.cpp @@ -185,7 +185,7 @@ bool move_tm_through_requantize(graphlib::Graph *graph) { if (not op) continue; - if (op->as()->has_tag("dont_erase")) + if (op->as()->tag_value_or("dont_erase", false)) continue; if (op->op_name() != "reshape" and op->op_name() != "transpose") diff --git a/pybuda/csrc/passes/pre_lowering_passes.cpp b/pybuda/csrc/passes/pre_lowering_passes.cpp index a6c81de8..a276d996 100644 --- a/pybuda/csrc/passes/pre_lowering_passes.cpp +++ b/pybuda/csrc/passes/pre_lowering_passes.cpp @@ -547,7 +547,7 @@ static bool has_fusable_upstream_matmul(graphlib::Graph *graph, graphlib::PyOpNo while (not (op->is_dense_matmul() || (op->is_depthwise_matmul() and not requant))) // requant can't be fused to depthwise { if (not (commutable_reshape(op))) { - if (not (requant and op->is_tm())) // requant can be commuted through TM + if (not (requant and op->is_tm() and op->op_name() != "narrow")) // requant can be commuted through TM return false; } diff --git 
index 0e061337..98985775 100644
--- a/pybuda/csrc/passes/remove_quant_dequant.cpp
+++ b/pybuda/csrc/passes/remove_quant_dequant.cpp
@@ -41,14 +41,88 @@ void bypass_qdq_pair(graphlib::Graph *graph, graphlib::OpNode *quantize, graphli
        }
    };

-    // Purge the scale of one before the other. This way if both quant and dequant point to the same scale (directly or indirectly),
-    // the first call to purge_scale_graph will do nothing as the scale has multiple users. After the quantize
-    // is bypassed, when wecall purge_scale_graph again the sale will be erased as the edge that was once
-    // pointing to the quantize is gone (thanks to bypass_node).
-    purge_scale_graph(graph->data_operands(quantize)[1]);
-    bypass_node(graph, quantize, true);
-    purge_scale_graph(graph->data_operands(dequantize)[1]);
-    bypass_node(graph, dequantize, true);
+    graphlib::TaggedNode *quant_scale = graph->data_operands(quantize)[1]->as();
+    graphlib::TaggedNode *dequant_scale = graph->data_operands(dequantize)[1]->as();
+
+    // If we can be certain that the scales have the same value then we can just drop them from the graph
+    bool scales_are_same_node = quant_scale == dequant_scale;
+    bool can_drop_scales = scales_are_same_node;
+    if (not can_drop_scales) {
+        if (quant_scale->has_tag("forked_from") and dequant_scale->has_tag("forked_from")) {
+            can_drop_scales = quant_scale->tag_value("forked_from") == dequant_scale->tag_value("forked_from");
+        }
+        else {
+            can_drop_scales = false;
+        }
+    }
+
+    if (can_drop_scales) {
+        // Purge the scale of one before the other. This way if both quant and dequant point to the same scale (directly or indirectly),
+        // the first call to purge_scale_graph will do nothing as the scale has multiple users. After the quantize
+        // is bypassed, when we call purge_scale_graph again the scale will be erased as the edge that was once
+        // pointing to the quantize is gone (thanks to bypass_node).
+        purge_scale_graph(graph->data_operands(quantize)[1]);
+        bypass_node(graph, quantize, true);
+        purge_scale_graph(graph->data_operands(dequantize)[1]);
+        bypass_node(graph, dequantize, true);
+    } else {
+        // If we cannot be certain that the scales are equal, then we must divide the dequant scale
+        // by the quant scale and multiply the activations.
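+        // Concretely (ignoring zero-points): quantize computes x / s_q and dequantize computes
+        // y * s_d, so erasing the pair leaves a residual factor of s_d / s_q on the data path.
+        // Below, that factor is built as dequant_scale * reciprocal(quant_scale) and multiplied
+        // into the quantize's input so the graph's numerics are preserved.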
+        graphlib::Edge dequant_scale_edge = retrieve_between_edge(graph, dequant_scale, dequantize);
+        graphlib::Edge quant_scale_edge = retrieve_between_edge(graph, quant_scale, quantize);
+
+        std::string quant_scale_recip_name = "quantize_scale_reciprocal_" + quant_scale->name();
+        graphlib::OpNode *quant_scale_recip = graph->add_node(graphlib::create_node(quant_scale_recip_name, "reciprocal"),
+                                                              graph->get_subgraph_id_for_node(quantize->id()));
+
+        graph->add_edge(quant_scale, quant_scale_recip);
+        quant_scale_recip->set_shape(quant_scale->shape());
+        quant_scale_recip->set_output_df_from_operands(graph);
+
+        std::string scale_multiply_name = "multiply_scales_" + quant_scale->name() + "_" + dequant_scale->name();
+        graphlib::OpNode *scale_multiply = graph->add_node(graphlib::create_node(scale_multiply_name, "multiply"),
+                                                           graph->get_subgraph_id_for_node(quantize->id()));
+
+        uint32_t max_scale_shape = std::max(dequant_scale->shape()[0], quant_scale->shape()[0]);
+        graphlib::Shape scale_multiply_shape = graphlib::Shape::create(std::vector{max_scale_shape});
+        scale_multiply->set_shape(scale_multiply_shape);
+
+        graph->add_edge(dequant_scale, scale_multiply);
+        graph->add_edge(quant_scale_recip, scale_multiply);
+        scale_multiply->set_output_df_from_operands(graph);
+
+        // Potentially add broadcast on scale edge if one of the scales is not shaped [1]
+        if (dequant_scale->shape()[0] != quant_scale->shape()[0]) {
+            TT_ASSERT(dequant_scale->shape()[0] == 1 or quant_scale->shape()[0] == 1, "Cannot multiply differently shaped tensors if the dim of one of them is not 1");
+
+            if (dequant_scale->shape()[0] > quant_scale->shape()[0]) {
+                graphlib::Edge edge = retrieve_between_edge(graph, dequant_scale, scale_multiply);
+                graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+            }
+            else {
+                graphlib::Edge edge = retrieve_between_edge(graph, quant_scale_recip, scale_multiply);
+                graph->get_edge_attributes(edge)->set_broadcast_dim(-1, max_scale_shape);
+            }
+        }
+
+        // Now create the op which multiplies the scales with the bias
+        std::string bias_multiply_name = "bias_qdq_bypass_scale_multiply_" + quantize->name() + "_" + dequantize->name();
+        graphlib::OpNode *bias_multiply = graph->add_node(graphlib::create_node(bias_multiply_name, "multiply"),
+                                                          graph->get_subgraph_id_for_node(quantize->id()));
+
+        graphlib::Node *bias = graph->data_operands(quantize)[0];
+        graph->add_edge(scale_multiply, bias_multiply);
+        bias_multiply->set_shape(bias->shape());
+        bias_multiply->set_output_df_from_operands(graph);
+
+        graphlib::Edge bias_quant_edge = retrieve_between_edge(graph, bias, quantize);
+        insert_node_on_edge(graph, bias_quant_edge, bias_multiply);
+
+        graph->remove_edge(dequant_scale_edge);
+        graph->remove_edge(quant_scale_edge);
+        bypass_node(graph, dequantize, true);
+        bypass_node(graph, quantize, true);
+    }
}

bool remove_quant_dequant(graphlib::Graph *graph) {
diff --git a/pybuda/pybuda/op/eval/buda/tm.py b/pybuda/pybuda/op/eval/buda/tm.py
index 90961e68..fde8b6f4 100644
--- a/pybuda/pybuda/op/eval/buda/tm.py
+++ b/pybuda/pybuda/op/eval/buda/tm.py
@@ -177,7 +177,8 @@ def eval(type, attr, ops):
        w = weights.shape[0]
        z = weights.shape[1]
        cout = weights.shape[3]
-        output_group = attr[3] // attr[0]
+        groups = attr[0]
+        output_group = attr[3] // groups

        weights = torch.nn.functional.pad(weights, (0, align_up_tile(cout) - cout))
        weights = weights.narrow(2, 0, attr[2])
        weights = weights.reshape(w, z, -1, weights.shape[-1])
        weights_sections = 
torch.split(weights, output_group, dim=-1) - new_weights = torch.zeros(w, z, align_up_tile(attr[0] * cin), align_up_tile(cout)) - for i, section in enumerate(weights_sections): + new_weights = torch.zeros(w, z, align_up_tile(groups * cin), align_up_tile(cout)) + for i in range(groups): + section = weights_sections[i] new_weights[ :, :, @@ -201,7 +203,7 @@ def eval(type, attr, ops): elif len(attr) == 5: weights = weights.transpose(1, 2) weights = weights.transpose(2, 3) - weights = weights.reshape(w,1, align_up_tile(attr[0] * cin), -1) + weights = weights.reshape(w,1, align_up_tile(groups * cin), -1) return weights if type == "conv2d_grouped_weights_bw": diff --git a/pybuda/pybuda/op/eval/pybuda/convolution.py b/pybuda/pybuda/op/eval/pybuda/convolution.py index 476a87fd..f221a353 100644 --- a/pybuda/pybuda/op/eval/pybuda/convolution.py +++ b/pybuda/pybuda/op/eval/pybuda/convolution.py @@ -281,6 +281,9 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): # Disallow depthwise path when training, needs BW ops implementation depthwise = depthwise and not dc.is_training_enabled() and not is_convtranspose2d + + # Disallow depthwise if we are force disabling it via env var + depthwise = depthwise and ("PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP" not in os.environ or os.environ["PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP"] != "1") if channel_last: activations = dc.op("reshape", [activations], (w, 1, y * x, cin), output_df=activations.output_df) diff --git a/pybuda/pybuda/op/eval/pybuda/quantize.py b/pybuda/pybuda/op/eval/pybuda/quantize.py index f92a205b..97c264a7 100644 --- a/pybuda/pybuda/op/eval/pybuda/quantize.py +++ b/pybuda/pybuda/op/eval/pybuda/quantize.py @@ -21,7 +21,7 @@ } STRING_TO_LOWER_LIMIT = { - "torch.int8": -127, + "torch.int8": -128, "torch.uint8": 0, "torch.int32": -2147483648, "torch.float32": -3.4028234663852886e+38, diff --git a/pybuda/pybuda/op/eval/pybuda/tm.py b/pybuda/pybuda/op/eval/pybuda/tm.py index 832f07a7..5fe19804 100644 --- a/pybuda/pybuda/op/eval/pybuda/tm.py +++ b/pybuda/pybuda/op/eval/pybuda/tm.py @@ -251,14 +251,16 @@ def eval(type, attr, ops): z = weights.shape[1] cin = weights.shape[2] cout = weights.shape[3] - output_group = cout // attr[0] + groups = attr[0] + output_group = cout // groups weights = torch.nn.functional.pad(weights, (0, align_up_tile(cout) - cout)) weights = weights.reshape(w, z, -1, weights.shape[-1]) weights_sections = torch.split(weights, output_group, dim=-1) - new_weights = torch.zeros(w, z, align_up_tile(attr[0] * cin), align_up_tile(cout)) - for i, section in enumerate(weights_sections): + new_weights = torch.zeros(w, z, align_up_tile(groups * cin), align_up_tile(cout)) + for i in range(groups): + section = weights_sections[i] new_weights[ :, :, @@ -275,7 +277,7 @@ def eval(type, attr, ops): elif len(attr) == 5: weights = weights.transpose(1, 2) weights = weights.transpose(2, 3) - weights = weights.reshape(w,1, align_up_tile(attr[0] * cin), -1) + weights = weights.reshape(w,1, align_up_tile(groups * cin), -1) return weights if type == "conv2d_grouped_weights_bw": diff --git a/pybuda/pybuda/op/eval/pybuda/transpose.py b/pybuda/pybuda/op/eval/pybuda/transpose.py index 619e34fd..450f8d1d 100644 --- a/pybuda/pybuda/op/eval/pybuda/transpose.py +++ b/pybuda/pybuda/op/eval/pybuda/transpose.py @@ -52,6 +52,12 @@ def lower(self, lc, tensors, outputs): if self.dim1 >= 0: self.dim1 -= tensors[0].shape.len() + # Buda requires least dim as dim0? 
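+        # That is, normalize so dim0 is the more negative of the two axes; the branches
+        # below only match canonical orderings such as (dim0, dim1) == (-2, -1).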
+ if self.dim0 > self.dim1: + tmp = self.dim0 + self.dim0 = self.dim1 + self.dim1 = tmp + if self.dim0 == -2 and self.dim1 == -1: lc.tm( BudaTransposeTM.create(self.dim0, self.dim1, z_dim_slice=self.z_dim_slice), @@ -81,8 +87,9 @@ def decompose_post_optimize(self, dc, inputs): orig_shape = inputs[0].shape if ( len(orig_shape) > 2 - and self.dim0 == -3 - and self.dim1 == -1 + and (self.dim0 == -3 + and self.dim1 == -1 or self.dim0 == -1 + and self.dim1 == -3) and ((len(orig_shape) == 4 and orig_shape[-4] == 1) or len(orig_shape) < 4) ): # XZ transpose diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet.py b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py similarity index 50% rename from pybuda/test/quantized/test_onnx_quantized_mobilenet.py rename to pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py index e6b09015..758aeddb 100644 --- a/pybuda/test/quantized/test_onnx_quantized_mobilenet.py +++ b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py @@ -5,6 +5,7 @@ import urllib import onnx +import pybuda.module import pytest import numpy as np import onnxruntime @@ -22,31 +23,29 @@ from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config -def test_onnx_quantized_mb_v2_depth(test_device): - # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + +def test_onnx_qdq_mobilenet_v2(test_device): + # pytest.skip("Models not yet uploaded") + # pytest.skip("WIP") if test_device.arch == BackendDevice.Blackhole: pytest.skip("Blackhole does not support quantized models") - # Download ONNX model - save_path = "third_party/confidential_customer_models/quantized/mb_v2_depthwise-Int8.onnx" - if not os.path.exists(save_path): - raise RuntimeError("Model not found") + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv2/mobilenetv2_ptq_qdq.onnx" - # LOAD ONNX model onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) + # onnx.checker.check_model(onnx_model) pybuda_onnx_model = OnnxModule( - "onnx_quantized_mb_v2_depthwise", + "onnx_quantized_mobilenet_v2", onnx_model, save_path, ) # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True - compiler_cfg.enable_auto_fusing = False - if test_device.devtype == BackendType.Silicon: - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" + os.environ["PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP"] = "1" # Sanity run input_shape = [] @@ -54,7 +53,8 @@ def test_onnx_quantized_mb_v2_depth(test_device): dimension = onnx_model.graph.input[i].type.tensor_type.shape i_shape = [d.dim_value for d in dimension.dim] input_shape.append(i_shape) - + + # Compile and verify verify_module( pybuda_onnx_model, @@ -64,38 +64,70 @@ def test_onnx_quantized_mb_v2_depth(test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - enabled = False if test_device.devtype == BackendType.Silicon else True, # verify_pybuda_codegen_vs_framework=True, - # verify_all=True + verify_all=True, + pcc=0.96 ), ) +class MobilenetV2QDQDepthwise(pybuda.module.PyBudaModule): + def __init__(self, name): + super().__init__(name) + self.add_parameter("conv_kernel", pybuda.Parameter(*(32, 1, 3, 3), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + 
self.add_parameter("conv_bias", pybuda.Parameter(*(32,), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + self.add_constant("const_00", shape=(1,)) + self.set_constant("const_00", torch.tensor([1.0])) + self.add_constant("/features/features.1/conv/conv.0/scale", shape=(32,)) + self.set_constant("/features/features.1/conv/conv.0/scale", torch.tensor([0.0409439280629158])) + + def forward(self, img): + + one = self.get_constant("const_00") + kernel = self.get_parameter("conv_kernel") + bias = self.get_parameter("conv_bias") + scale = self.get_constant("/features/features.1/conv/conv.0/scale") + + # Muse use int8 inputs + img = pybuda.op.Quantize("", img, one, out_dtype=torch.int8, axis=0, zero_point=0.0) + kernel = pybuda.op.Quantize("", kernel, scale, out_dtype=torch.int8, axis=0, zero_point=0.0) + bias = pybuda.op.Quantize("", bias, scale, out_dtype=torch.int32, axis=0, zero_point=0.0) + + # This Conv2d will decompose. One op included in the decomposition will be "depthwise" + out = pybuda.op.Conv2d("", img, kernel, bias, stride=[1, 1], padding=[1, 1, 1, 1], dilation=1, groups=32, channel_last=0) + + # Output must be float32 or the output cannot be untilized + out = pybuda.op.Dequantize("", out, scale, out_dtype=torch.float32, axis=0, zero_point=0.0) + + # These are the inverse of what the output of the conv2d decompositon inserts. + # Puth this here so that there are no tms between the depthwise and the output + out = pybuda.op.Reshape("", out, (1, 1, 32, 12544)) + out = pybuda.op.Transpose("", out, -1, -2) + return out + +def test_depthwise(test_device): -def test_onnx_quantized_mb_v2(test_device): - # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + # pytest.skip("Models not yet uploaded") + pytest.skip("Use this test to debug the int8 depthwise issue") if test_device.arch == BackendDevice.Blackhole: pytest.skip("Blackhole does not support quantized models") - # Download ONNX model - save_path = "third_party/confidential_customer_models/quantized/mobilenet_v2-Int8.onnx" - if not os.path.exists(save_path): - raise RuntimeError("Model not found") + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv2/mobilenetv2_ptq_qdq.onnx" - # LOAD ONNX model onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_mb_v2", - onnx_model, - save_path, - ) + # onnx.checker.check_model(onnx_model) + pybuda_onnx_model = MobilenetV2QDQDepthwise("mbv2_depthwise") # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False + compiler_cfg.retain_tvm_python_files = True + # compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" - compiler_cfg.place_on_new_epoch("conv2d_122.dc.reshape.0.dc.sparse_matmul.14.lc2") + # compiler_cfg.place_on_new_epoch("conv2d_122.dc.reshape.0.dc.sparse_matmul.14.lc2") os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" @@ -108,59 +140,11 @@ def test_onnx_quantized_mb_v2(test_device): dimension = onnx_model.graph.input[i].type.tensor_type.shape i_shape = [d.dim_value for d in dimension.dim] 
input_shape.append(i_shape) - - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - enabled = False if test_device.devtype == BackendType.Silicon else True, - # verify_pybuda_codegen_vs_framework=True, - # verify_all=True - ), - ) - -def test_onnx_qdq_mobilenet(test_device): - # pytest.skip("Models not yet uploaded") - pytest.skip("WIP") - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/mobilenetv2_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - # onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_mobilenet_v2", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - # Compile and verify verify_module( pybuda_onnx_model, - input_shape, + [(1, 32, 112, 112)], verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, @@ -168,5 +152,6 @@ def test_onnx_qdq_mobilenet(test_device): test_kind=TestKind.INFERENCE, # verify_pybuda_codegen_vs_framework=True, verify_all=True, + pcc=0.96 ), ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py new file mode 100644 index 00000000..7bfa7121 --- /dev/null +++ b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import urllib + +import onnx +import pybuda.module +import pytest +import numpy as np +import onnxruntime +import torch +import pybuda +from pybuda import ( + OnnxModule, + VerifyConfig, + DataFormat, + BackendDevice, + BackendType, +) +from pybuda._C import MathFidelity +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config + + +def find_init(init_names, model): + initializers = [] + for init in model.graph.initializer: + if init.name in init_names: + initializers.append(init) + return initializers + +def find_node(node_name, model): + for node in model.graph.node: + if node.name == node_name: + return node + return None + +def find_idx(node_name, model): + for i, node in enumerate(model.graph.node): + if node.name == node_name: + return i + return None + +def test_onnx_qdq_mobilenet_v3(test_device): + # pytest.skip("Models not yet uploaded") + pytest.skip("WIP") + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = 
"third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv3/mobilenetv3_ptq_qdq.onnx" + + onnx_model = onnx.load(save_path) + + split_node_name = '/features/features.1/add2/Add' + split_node_idx = find_idx(split_node_name, onnx_model) + nodes_to_remove = [] + for i in range(split_node_idx + 1, len(onnx_model.graph.node)): + nodes_to_remove.append(onnx_model.graph.node[i]) + for node in nodes_to_remove: + onnx_model.graph.node.remove(node) + + # import pdb; pdb.set_trace() + output_name = onnx_model.graph.output[0].name + onnx_model.graph.output.pop() + new_output = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.FLOAT, [1,120,28,28]) + onnx_model.graph.output.append(new_output) + last_node = find_node(split_node_name, onnx_model) + last_node.output[0] = output_name + + + + + onnx.checker.check_model(onnx_model) + onnx.save(onnx_model, "./chopped_mobilenetv3.onnx") + pybuda_onnx_model = OnnxModule( + "onnx_quantized_mobilenet_v3", + onnx_model, + "./chopped_mobilenetv3.onnx", + ) + + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.retain_tvm_python_files = True + compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 + # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" + # os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + # os.environ["PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + # verify_tvm_compile=True, + # verify_all=True, + ), + ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_regnet_y.py b/pybuda/test/quantized/test_onnx_quantized_regnet_y.py new file mode 100644 index 00000000..d97f63d6 --- /dev/null +++ b/pybuda/test/quantized/test_onnx_quantized_regnet_y.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import urllib + +import onnx +import pytest +import numpy as np +import onnxruntime +import torch +import pybuda +from pybuda import ( + OnnxModule, + VerifyConfig, + DataFormat, + BackendDevice, +) +from pybuda._C import MathFidelity +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config + +def test_onnx_qdq_regnet_y(test_device): + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Grayskull does not support quantized models") + + save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/regnet_y_32gf/regnet_y_32gf_ptq_qdq.onnx" + + onnx_model = onnx.load(save_path) + # onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_qdq_regnet_y", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + 
compiler_cfg.retain_tvm_python_files = True + compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + # verify_pybuda_codegen_vs_framework=True, + # verify_all=True, + ), + input_params=[{"requires_grad": False}] + ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_resnet.py b/pybuda/test/quantized/test_onnx_quantized_resnet.py index e63b3e7a..d5ba8468 100644 --- a/pybuda/test/quantized/test_onnx_quantized_resnet.py +++ b/pybuda/test/quantized/test_onnx_quantized_resnet.py @@ -21,100 +21,19 @@ from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config - - - -def test_onnx_quantized_resnet(test_device): - # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip() - - # Download ONNX model - save_path = "third_party/confidential_customer_models/quantized/ResNet50-v1.5-Int8.onnx" - if not os.path.exists(save_path): - raise RuntimeError("Model not found") - - # LOAD ONNX model - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_ResNet50", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_df_override = DataFormat.Float32 - - # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH"] = "1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_INPUT"] = "quantize_0.dc.buda_quantize.1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_OUTPUT"] = "conv2d_1.dc.matmul.11" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - # tti_path = "onnx_int8_resnet50_epoch_0.tti" - # if not os.path.exists(tti_path): - # tt_module = pybuda_onnx_model - # device = pybuda.TTDevice( - # "tt0", module=tt_module,arch=pybuda.BackendDevice.Wormhole_B0, devtype=pybuda.BackendType.Silicon) - # tti_img = device.compile_to_image( - # img_path=tti_path, - # training=False, - # sample_inputs=[torch.randn(shape) for shape in input_shape], - # ) - - - # device_img: pybuda.TTDeviceImage = pybuda.TTDeviceImage.load_from_disk(tti_path) - # ttdevice = pybuda.TTDevice.load_image(img=device_img) - - # inputs = [torch.randn(shape) for shape in input_shape] - # ttdevice.push_to_inputs(*inputs) - # output_q = pybuda.run_inference(_sequential=True) - # output = output_q.get()[0].value().detach() - - # golden_output = pybuda_onnx_model.forward(*inputs) - # assert np.allclose(output, golden_output[0], atol=1e-3, rtol=1e-3) - # Compile and 
verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - # verify_pybuda_codegen_vs_framework=True, - verify_all=True, - ), - ) - def test_onnx_qdq_resnet(test_device): - pytest.skip("WIP") if test_device.arch == BackendDevice.Blackhole: pytest.skip("Blackhole does not support quantized models") if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull does not support quantized models") - save_path = "third_party/confidential_customer_models/bos/resnet50_ptq_qdq.onnx" + save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/resnet50/resnet50_ptq_qdq.onnx" onnx_model = onnx.load(save_path) # onnx.checker.check_model(onnx_model) pybuda_onnx_model = OnnxModule( - "onnx_quantized_ResNet50", + "onnx_quantized_qdq_ResNet50", onnx_model, save_path, ) @@ -144,5 +63,6 @@ def test_onnx_qdq_resnet(test_device): test_kind=TestKind.INFERENCE, # verify_pybuda_codegen_vs_framework=True, verify_all=True, + pcc=0.97 ), ) \ No newline at end of file From 2c1ecfa3dab173577969f98074f62f39c6ac9e38 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Thu, 25 Jul 2024 16:16:19 +0000 Subject: [PATCH 052/116] Moved quantized tests (cherry picked from commit 23c069197bdbc62cce1bfd06c5db867ac7cac969) --- .../test_onnx_quantized_mobilenet_v2.py | 157 ------------------ .../test_onnx_quantized_mobilenet_v3.py | 116 ------------- .../quantized/test_onnx_quantized_regnet_y.py | 69 -------- .../quantized/test_onnx_quantized_resnet.py | 68 -------- .../test/quantized/test_onnx_quantized_vit.py | 121 -------------- 5 files changed, 531 deletions(-) delete mode 100644 pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py delete mode 100644 pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py delete mode 100644 pybuda/test/quantized/test_onnx_quantized_regnet_y.py delete mode 100644 pybuda/test/quantized/test_onnx_quantized_resnet.py delete mode 100644 pybuda/test/quantized/test_onnx_quantized_vit.py diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py deleted file mode 100644 index 758aeddb..00000000 --- a/pybuda/test/quantized/test_onnx_quantized_mobilenet_v2.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import urllib - -import onnx -import pybuda.module -import pytest -import numpy as np -import onnxruntime -import torch -import pybuda -from pybuda import ( - OnnxModule, - VerifyConfig, - DataFormat, - BackendDevice, - BackendType, -) -from pybuda._C import MathFidelity -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import _get_global_compiler_config - - -def test_onnx_qdq_mobilenet_v2(test_device): - # pytest.skip("Models not yet uploaded") - # pytest.skip("WIP") - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv2/mobilenetv2_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - # onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_mobilenet_v2", - onnx_model, - save_path, - ) - # Configurations - 
compiler_cfg = _get_global_compiler_config()
-    compiler_cfg.balancer_policy = "Ribbon"
-    os.environ["PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP"] = "1"
-
-    # Sanity run
-    input_shape = []
-    for i in range(len(onnx_model.graph.input)):
-        dimension = onnx_model.graph.input[i].type.tensor_type.shape
-        i_shape = [d.dim_value for d in dimension.dim]
-        input_shape.append(i_shape)
-
-
-    # Compile and verify
-    verify_module(
-        pybuda_onnx_model,
-        input_shape,
-        verify_cfg=VerifyConfig(
-            arch=test_device.arch,
-            devtype=test_device.devtype,
-            devmode=test_device.devmode,
-            test_kind=TestKind.INFERENCE,
-            # verify_pybuda_codegen_vs_framework=True,
-            verify_all=True,
-            pcc=0.96
-        ),
-    )
-
-class MobilenetV2QDQDepthwise(pybuda.module.PyBudaModule):
-    def __init__(self, name):
-        super().__init__(name)
-        self.add_parameter("conv_kernel", pybuda.Parameter(*(32, 1, 3, 3), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32))
-        self.add_parameter("conv_bias", pybuda.Parameter(*(32,), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32))
-        self.add_constant("const_00", shape=(1,))
-        self.set_constant("const_00", torch.tensor([1.0]))
-        self.add_constant("/features/features.1/conv/conv.0/scale", shape=(32,))
-        self.set_constant("/features/features.1/conv/conv.0/scale", torch.tensor([0.0409439280629158]))
-
-    def forward(self, img):
-
-        one = self.get_constant("const_00")
-        kernel = self.get_parameter("conv_kernel")
-        bias = self.get_parameter("conv_bias")
-        scale = self.get_constant("/features/features.1/conv/conv.0/scale")
-
-        # Must use int8 inputs
-        img = pybuda.op.Quantize("", img, one, out_dtype=torch.int8, axis=0, zero_point=0.0)
-        kernel = pybuda.op.Quantize("", kernel, scale, out_dtype=torch.int8, axis=0, zero_point=0.0)
-        bias = pybuda.op.Quantize("", bias, scale, out_dtype=torch.int32, axis=0, zero_point=0.0)
-
-        # This Conv2d will decompose. One op included in the decomposition will be "depthwise"
-        out = pybuda.op.Conv2d("", img, kernel, bias, stride=[1, 1], padding=[1, 1, 1, 1], dilation=1, groups=32, channel_last=0)
-
-        # Output must be float32 or the output cannot be untilized
-        out = pybuda.op.Dequantize("", out, scale, out_dtype=torch.float32, axis=0, zero_point=0.0)
-
-        # These are the inverse of what the output of the conv2d decomposition inserts.
-        # Put this here so that there are no tms between the depthwise and the output
-        out = pybuda.op.Reshape("", out, (1, 1, 32, 12544))
-        out = pybuda.op.Transpose("", out, -1, -2)
-        return out
-
-def test_depthwise(test_device):
-
-    # pytest.skip("Models not yet uploaded")
-    pytest.skip("Use this test to debug the int8 depthwise issue")
-    if test_device.arch == BackendDevice.Blackhole:
-        pytest.skip("Blackhole does not support quantized models")
-
-    if test_device.arch == BackendDevice.Grayskull:
-        pytest.skip("Grayskull does not support quantized models")
-
-    save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv2/mobilenetv2_ptq_qdq.onnx"
-
-    onnx_model = onnx.load(save_path)
-    # onnx.checker.check_model(onnx_model)
-    pybuda_onnx_model = MobilenetV2QDQDepthwise("mbv2_depthwise")
-    # Configurations
-    compiler_cfg = _get_global_compiler_config()
-    compiler_cfg.balancer_policy = "Ribbon"
-    compiler_cfg.enable_t_streaming = True
-    compiler_cfg.enable_auto_fusing = False
-    compiler_cfg.retain_tvm_python_files = True
-    # compiler_cfg.default_math_fidelity = MathFidelity.HiFi4
-    compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst"
-    # compiler_cfg.place_on_new_epoch("conv2d_122.dc.reshape.0.dc.sparse_matmul.14.lc2")
-    os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1"
-    os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1"
-    os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}"
-    if test_device.devtype == BackendType.Silicon:
-        os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{96*1024}"
-
-    # Sanity run
-    input_shape = []
-    for i in range(len(onnx_model.graph.input)):
-        dimension = onnx_model.graph.input[i].type.tensor_type.shape
-        i_shape = [d.dim_value for d in dimension.dim]
-        input_shape.append(i_shape)
-
-    # Compile and verify
-    verify_module(
-        pybuda_onnx_model,
-        [(1, 32, 112, 112)],
-        verify_cfg=VerifyConfig(
-            arch=test_device.arch,
-            devtype=test_device.devtype,
-            devmode=test_device.devmode,
-            test_kind=TestKind.INFERENCE,
-            # verify_pybuda_codegen_vs_framework=True,
-            verify_all=True,
-            pcc=0.96
-        ),
-    )
\ No newline at end of file
diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py b/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py
deleted file mode 100644
index 7bfa7121..00000000
--- a/pybuda/test/quantized/test_onnx_quantized_mobilenet_v3.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-# SPDX-License-Identifier: Apache-2.0
-import os
-import urllib
-
-import onnx
-import pybuda.module
-import pytest
-import numpy as np
-import onnxruntime
-import torch
-import pybuda
-from pybuda import (
-    OnnxModule,
-    VerifyConfig,
-    DataFormat,
-    BackendDevice,
-    BackendType,
-)
-from pybuda._C import MathFidelity
-from pybuda.verify import verify_module
-from pybuda.verify.config import TestKind
-from pybuda.config import _get_global_compiler_config
-
-
-def find_init(init_names, model):
-    initializers = []
-    for init in model.graph.initializer:
-        if init.name in init_names:
-            initializers.append(init)
-    return initializers
-
-def find_node(node_name, model):
-    for node in model.graph.node:
-        if node.name == node_name:
-            return node
-    return None
-
-def find_idx(node_name, model):
-    for i, node in enumerate(model.graph.node):
-        if node.name == node_name:
-            return i
-    return None
-
-def test_onnx_qdq_mobilenet_v3(test_device):
-    # pytest.skip("Models not yet uploaded")
-    pytest.skip("WIP")
-    if test_device.arch == BackendDevice.Blackhole:
- pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/mobilenetv3/mobilenetv3_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - - split_node_name = '/features/features.1/add2/Add' - split_node_idx = find_idx(split_node_name, onnx_model) - nodes_to_remove = [] - for i in range(split_node_idx + 1, len(onnx_model.graph.node)): - nodes_to_remove.append(onnx_model.graph.node[i]) - for node in nodes_to_remove: - onnx_model.graph.node.remove(node) - - # import pdb; pdb.set_trace() - output_name = onnx_model.graph.output[0].name - onnx_model.graph.output.pop() - new_output = onnx.helper.make_tensor_value_info(output_name, onnx.TensorProto.FLOAT, [1,120,28,28]) - onnx_model.graph.output.append(new_output) - last_node = find_node(split_node_name, onnx_model) - last_node.output[0] = output_name - - - - - onnx.checker.check_model(onnx_model) - onnx.save(onnx_model, "./chopped_mobilenetv3.onnx") - pybuda_onnx_model = OnnxModule( - "onnx_quantized_mobilenet_v3", - onnx_model, - "./chopped_mobilenetv3.onnx", - ) - - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.retain_tvm_python_files = True - compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 - # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" - # os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" - # os.environ["PYBUDA_DISABLE_DEPTHWISE_CONV2D_DECOMP"] = "1" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - verify_pybuda_codegen_vs_framework=True, - # verify_tvm_compile=True, - # verify_all=True, - ), - ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_regnet_y.py b/pybuda/test/quantized/test_onnx_quantized_regnet_y.py deleted file mode 100644 index d97f63d6..00000000 --- a/pybuda/test/quantized/test_onnx_quantized_regnet_y.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import urllib - -import onnx -import pytest -import numpy as np -import onnxruntime -import torch -import pybuda -from pybuda import ( - OnnxModule, - VerifyConfig, - DataFormat, - BackendDevice, -) -from pybuda._C import MathFidelity -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import _get_global_compiler_config - -def test_onnx_qdq_regnet_y(test_device): - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/regnet_y_32gf/regnet_y_32gf_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - # onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_qdq_regnet_y", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = 
_get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.retain_tvm_python_files = True - compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - # verify_pybuda_codegen_vs_framework=True, - # verify_all=True, - ), - input_params=[{"requires_grad": False}] - ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_resnet.py b/pybuda/test/quantized/test_onnx_quantized_resnet.py deleted file mode 100644 index d5ba8468..00000000 --- a/pybuda/test/quantized/test_onnx_quantized_resnet.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import urllib - -import onnx -import pytest -import numpy as np -import onnxruntime -import torch -import pybuda -from pybuda import ( - OnnxModule, - VerifyConfig, - DataFormat, - BackendDevice, -) -from pybuda._C import MathFidelity -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import _get_global_compiler_config - -def test_onnx_qdq_resnet(test_device): - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/resnet50/resnet50_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - # onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_qdq_ResNet50", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - # verify_pybuda_codegen_vs_framework=True, - verify_all=True, - pcc=0.97 - ), - ) \ No newline at end of file diff --git a/pybuda/test/quantized/test_onnx_quantized_vit.py b/pybuda/test/quantized/test_onnx_quantized_vit.py deleted file mode 100644 index c43382cd..00000000 --- a/pybuda/test/quantized/test_onnx_quantized_vit.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os - -import onnx -import pytest -from pybuda import ( - OnnxModule, - VerifyConfig, - DataFormat, - 
BackendDevice, - BackendType, -) -from pybuda._C import MathFidelity -from pybuda.verify import verify_module -from pybuda.verify.config import TestKind -from pybuda.config import _get_global_compiler_config - -def test_int8_onnx_vit_calibrated(test_device): - pytest.skip("Not continuing support for QOperator models") - # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip() - - # Download ONNX model - save_path = "third_party/confidential_customer_models/quantized/vit-Int8-calibrated.onnx" - if not os.path.exists(save_path): - raise RuntimeError("Model not found") - - # LOAD ONNX model - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_vit_calibrated", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_df_override = DataFormat.Float32 - - # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - - # Compile and verify - pcc = 0.97 if test_device.devtype == BackendType.Silicon else 0.99 - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - pcc=pcc, - ), - ) - - -def test_onnx_qdq_vit(test_device): - if test_device.arch == BackendDevice.Blackhole: - pytest.skip("Blackhole does not support quantized models") - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull does not support quantized models") - - save_path = "third_party/confidential_customer_models/bos/bos_onnx_062524/priorityA/vit_b_16/vit_b_16_ptq_qdq.onnx" - - onnx_model = onnx.load(save_path) - # onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_vit", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_math_fidelity = MathFidelity.HiFi4 - compiler_cfg.retain_tvm_python_files = True - compiler_cfg.convert_framework_params_to_tvm = True - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=TestKind.INFERENCE, - # verify_pybuda_codegen_vs_framework=True, - verify_all=True, - ), - ) \ No newline at end of file From a648e56470cbe4751bc531a27d8e47a8fa07c090 Mon Sep 17 00:00:00 2001 From: Vladica Obojevic 
Date: Fri, 26 Jul 2024 13:47:23 +0000
Subject: [PATCH 053/116] Add missing operators in element-wise binary
 operators test according to test plan

(cherry picked from commit f7ee170dea780c3be86c84a2d71709f5a02b06e3)
---
 .../test/operators/eltwise_binary/conftest.py |  13 +-
 .../operators/eltwise_binary/test_command.sh  |  10 ++
 .../eltwise_binary/test_eltwise_binary.py     | 157 +++++++++++++++---
 3 files changed, 158 insertions(+), 22 deletions(-)

diff --git a/pybuda/test/operators/eltwise_binary/conftest.py b/pybuda/test/operators/eltwise_binary/conftest.py
index 4642e6c8..9258c255 100644
--- a/pybuda/test/operators/eltwise_binary/conftest.py
+++ b/pybuda/test/operators/eltwise_binary/conftest.py
@@ -34,6 +34,13 @@ def pytest_addoption(parser):
         default=[1, 16, 32, 64],
         help="Shape of the tensor."
     )
+    # shape prologued
+    parser.addoption(
+        "--bin_shape_prologued",
+        action="store",
+        default='((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False)',
+        help="Shape of the tensor, input source flag, should-prolog flag."
+    )
     # operation
     parser.addoption(
         "--bin_op",
@@ -59,7 +66,11 @@ def pytest_generate_tests(metafunc):
 
     option_shape = metafunc.config.option.bin_shape
     if 'bin_shape' in metafunc.fixturenames and option_shape is not None:
         metafunc.parametrize("bin_shape", [option_shape])
+
+    option_shape_prologued = metafunc.config.option.bin_shape_prologued
+    if 'bin_shape_prologued' in metafunc.fixturenames and option_shape_prologued is not None:
+        metafunc.parametrize("bin_shape_prologued", [option_shape_prologued])
 
     option_op = metafunc.config.option.bin_op
     if 'bin_op' in metafunc.fixturenames and option_op is not None:
-        metafunc.parametrize("bin_op", [option_op])
\ No newline at end of file
+        metafunc.parametrize("bin_op", [option_op])
diff --git a/pybuda/test/operators/eltwise_binary/test_command.sh b/pybuda/test/operators/eltwise_binary/test_command.sh
index bdd2eea8..9114addf 100644
--- a/pybuda/test/operators/eltwise_binary/test_command.sh
+++ b/pybuda/test/operators/eltwise_binary/test_command.sh
@@ -33,3 +33,13 @@
 pytest -svv test_eltwise_binary_single.py --bin_model model_4 --bin_train True --bin_recompute False --bin_op 'Add' --bin_shape '[108, 13, 73]'
 
 pytest -svv test_eltwise_binary_single.py --bin_model model_1 --bin_train True --bin_recompute False --bin_op 'Add' --bin_shape '[1, 1, 10000, 10000]'
+
+# Run single test according to the new test plan
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelOpSrcFromTmEdge1' --bin_shape '(1, 1000, 100)' --bin_op "Heaviside" --runxfail --no-skips
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelFromAnotherOp' --bin_shape '(1, 1, 9920, 1)' --bin_op "Equal" --runxfail --no-skips
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelFromAnotherOp' --bin_shape '(1, 1, 9920, 1)' --bin_op "NotEqual" --runxfail --no-skips
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelFromAnotherOp' --bin_shape '(1, 3, 3)' --bin_op "BinaryStack" --runxfail --no-skips
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelConstEvalPass' --bin_shape '(1, 3, 3, 3)' --bin_op "BinaryStack" --runxfail --no-skips
+pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single --bin_model 'ModelFromAnotherOp' --bin_shape '(1, 1, 10,
1000)' --bin_op "BinaryStack" --runxfail --no-skips + +pytest -svv test_eltwise_binary.py::test_eltwise_binary_ops_per_test_plan_single_prologued --bin_shape_prologued '((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False)' --bin_op "BinaryStack" --runxfail --no-skips diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index c7311257..8804fb3b 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -84,18 +84,19 @@ class ModelFromAnotherOp(PyBudaModule): model_name = "model_op_src_from_another_op" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from another op") self.testname = "Element-wise binary operator " + opname + " test _ op src from another op" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def forward(self, x, y): # we use Add and Subtract operators to create two operands which are inputs for the binary operator xx = pybuda.op.Add("Add0", x, y) yy = pybuda.op.Subtract("Subtract0", x, y) - output = self.operator(self.opname + "1", xx, yy) + output = self.operator(self.opname + "1", xx, yy, **self.kwargs) return output @@ -103,15 +104,16 @@ class ModelFromHost(PyBudaModule): model_name = "model_op_src_from_host" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from host") self.testname = "Element-wise binary operator " + opname + " test _ op src from host" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def forward(self, x, y): - output = self.operator(self.opname + "0", x, y) + output = self.operator(self.opname + "0", x, y, **self.kwargs) return output @@ -119,15 +121,16 @@ class ModelFromDramQueue(PyBudaModule): model_name = "model_op_src_from_dram_queue" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue") self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def forward(self, x, y): - output = self.operator(self.opname + "0", x, y) + output = self.operator(self.opname + "0", x, y, **self.kwargs) return output @@ -135,12 +138,13 @@ class ModelFromDramQueuePrologued(PyBudaModule): model_name = "model_op_src_from_dram_queue_prologued" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue prologued") self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue prologued" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() @@ -151,7 +155,7 @@ def my_rand(*shape, requires_grad=False): self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) def forward(self, x): - output = self.operator(self.opname + "0", self.get_constant("c"), x) + output = 
self.operator(self.opname + "0", self.get_constant("c"), x, **self.kwargs) return output @@ -159,12 +163,13 @@ class ModelConstEvalPass(PyBudaModule): model_name = "model_op_src_const_eval_pass" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src const eval pass") self.testname = "Element-wise binary operator " + opname + " test _ op src const eval pass" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() @@ -182,7 +187,7 @@ def my_rand(*shape, requires_grad=False): ] def forward(self, x, y): - v1 = self.operator(self.opname + "0", self.get_constant("c1"), self.get_constant("c2")) + v1 = self.operator(self.opname + "0", self.get_constant("c1"), self.get_constant("c2"), **self.kwargs) # v2 and v3 consume inputs v2 = pybuda.op.Add("Add1", x, y) v3 = pybuda.op.Add("Add2", v1, v2) @@ -193,17 +198,18 @@ class ModelOpSrcFromTmEdge1(PyBudaModule): model_name = "model_op_src_from_tm_edge1" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge1") self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge1" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def forward(self, x, y): xx = pybuda.op.Add("Add0", x, y) yy = pybuda.op.tm.Transpose("Transpose0", xx, -1, -2) - output = self.operator(self.opname + "1", yy, yy) + output = self.operator(self.opname + "1", yy, yy, **self.kwargs) return output @@ -211,18 +217,18 @@ class ModelOpSrcFromTmEdge2(PyBudaModule): model_name = "model_op_src_from_tm_edge2" - def __init__(self, operator, opname, shape): + def __init__(self, operator, opname, shape, kwargs): super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge2") self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge2" self.operator = operator self.opname = opname self.shape = shape + self.kwargs = kwargs def forward(self, x, y): - # xx = pybuda.op.tm.Transpose("Transpose0", x, -1, -2) yy = pybuda.op.tm.Transpose("Transpose1", y, -1, -2) - output = self.operator(self.opname + "2", xx, yy) + output = self.operator(self.opname + "2", xx, yy, **self.kwargs) return output @@ -232,6 +238,7 @@ def verify( input_operator: str, input_shape: TensorShape, number_of_operands: int, + kwargs: Dict = {}, input_params: List[Dict] = [], input_source_flag: InputSourceFlags = None, dev_data_format: pybuda.DataFormat = None, @@ -240,7 +247,8 @@ def verify( '''Common verification function for all tests''' operator = getattr(pybuda.op, input_operator) - model = model_type(operator=operator, opname=input_operator, shape=input_shape) + + model = model_type(operator=operator, opname=input_operator, shape=input_shape, kwargs=kwargs) input_shapes = tuple([input_shape for _ in range(number_of_operands)]) logger.trace(f"***input_shapes: {input_shapes}") @@ -283,6 +291,8 @@ def get_eltwise_binary_ops(): "LessEqual", #10 "Equal", #11 "NotEqual", #12 + "Divide", #13 + "BinaryStack", #14 ] def get_input_shapes(): @@ -406,12 +416,25 @@ def test_eltwise_binary_ops_per_test_plan( # Observed Bugs: 
--------------------------------------------------------------------------------------------------------------------
     # 1. input_shape in ((1, 1000, 100), (10, 1000, 100)):
     if model_type == ModelOpSrcFromTmEdge1 and input_operator == "Heaviside" and input_shape in (s[30], s[43]):
-        pytest.xfail(reason="RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/policies/policy_utils.cpp:2221: " +
-                     "graph ->get_edges( graph->get_node_by_name(nopInsertInst->src), " +
-                     "graph->get_node_by_name(nopInsertInst->dest)) .size() == 1")
+        # Error Message: "RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/policies/policy_utils.cpp:2221: " +
+        #                "graph ->get_edges( graph->get_node_by_name(nopInsertInst->src), " +
+        #                "graph->get_node_by_name(nopInsertInst->dest)) .size() == 1"
+        pytest.xfail(reason="Buggy shapes for ModelOpSrcFromTmEdge1.")
     # 2. input_shape in ((1, 9920, 1), (1, 1, 9920, 1), (9, 1, 9920, 1)):
     if model_type == ModelFromAnotherOp and input_operator in ["Equal", "NotEqual"] and input_shape in (s[32], s[56], s[69]):
-        pytest.xfail(reason="RuntimeError: Fatal balancer error: Could not reconcile constraints: path[Add0 -> _fused_op_0]")
+        # Error Message: "RuntimeError: Fatal balancer error: Could not reconcile constraints: path[Add0 -> _fused_op_0]"
+        pytest.xfail(reason="Buggy shapes for ModelFromAnotherOp.")
+    # 3. BinaryStack bugs:
+    if input_operator == "BinaryStack":
+        if len(input_shape) in (2, 3):
+            # input_shapes are 2-dimensional and 3-dimensional:
+            pytest.xfail(reason="BinaryStack operator is not working for 2D and 3D shapes.")
+        elif model_type == ModelConstEvalPass:
+            # model_type is ModelConstEvalPass:
+            pytest.xfail(reason="BinaryStack operator is not working for ModelConstEvalPass.")
+        elif input_shape in (s[55], s[56], s[57], s[68], s[69], s[70]):
+            # input_shapes are all with extreme ratios between height/width:
+            pytest.xfail(reason="BinaryStack operator is not working for shapes that have extreme ratios between height/width.")
     # ------------------------------------------------------------------------------------------------------------------------------------
 
@@ -419,12 +442,17 @@
     if model_type == ModelFromDramQueue:
         input_source_flag = InputSourceFlags.FROM_DRAM
 
+    kwargs={}
+    if input_operator == "BinaryStack":
+        kwargs['dim'] = -1
+
     verify(
         test_device=test_device,
         model_type=model_type,
         input_operator=input_operator,
         input_shape=input_shape,
         number_of_operands=2,
+        kwargs=kwargs,
         input_source_flag=input_source_flag,
         dev_data_format=dev_data_format,
         math_fidelity=input_math_fidelity,
@@ -451,7 +479,7 @@ def get_eltwise_binary_ops_prologued():
     pytest.param("Max"),        #01
     pytest.param("Min"),        #02
     pytest.param("Power",       #03
-                 marks=pytest.mark.xfail(reason="AssertionError: Data mismatch detected")),
+                 marks=pytest.mark.xfail(reason="Validation error caused by pcc threshold.")),
     pytest.param("Subtract"),   #04
     pytest.param("Multiply"),   #05
     pytest.param("Heaviside"),  #06
     pytest.param("Greater"),    #07
     pytest.param("GreaterEqual"), #08
     pytest.param("Less"),       #09
     pytest.param("LessEqual"),  #10
     pytest.param("Equal"),      #11
     pytest.param("NotEqual"),   #12
+    pytest.param("Divide"),     #13
+    pytest.param("BinaryStack"), #14
 ]

 def get_input_shapes_prologued():
@@ -518,12 +548,28 @@ def test_eltwise_binary_ops_per_test_plan_dram_prologued(
     input_math_fidelity=None
 ):

+    # Observed Bugs: --------------------------------------------------------------------------------------------------------------------
+    # 1.
BinaryStack bugs:
+    if input_operator == "BinaryStack" and len(input_shape) in (2, 3):
+        # input_shapes are 2-dimensional and 3-dimensional:
+        pytest.xfail(reason="BinaryStack operator is not working for 2D and 3D shapes.")
+    # -----------------------------------------------------------------------------------------------------------------------------------
+
+    # Divide behaves differently from other operators for this shape
+    if input_operator == "Divide" and input_shape == (2, 100, 1000):
+        should_prolog = True
+
+    kwargs = {}
+    if input_operator == "BinaryStack":
+        kwargs['dim'] = -1
+
     verify(
         test_device=test_device,
         model_type=model_type,
         input_operator=input_operator,
         input_shape=input_shape,
         number_of_operands=1,
+        kwargs=kwargs,
         input_source_flag=input_source_flag,
         dev_data_format=dev_data_format,
         math_fidelity=input_math_fidelity,
@@ -625,6 +671,75 @@ def test_df_eltwise_binary_ops_per_test_plan(input_operator, model_type, test_device):
     )
 
 
+# LogicalAnd operator:
+# Compilation fails; it looks like the operator is not yet supported by the compiler.
+# Error Message: "Compile error: 'logical_and'"
+# ...
+# Error Message: "KeyError: 'logical_and'"
+@pytest.mark.xfail(reason="Not implemented")
+def test_eltwise_binary_logicaland_operator(test_device):
+
+    verify(
+        test_device=test_device,
+        model_type=ModelFromHost,
+        input_operator="LogicalAnd",
+        input_shape=[1, 3, 3],
+        number_of_operands=2,
+    )
+
+
+# It is not clear what the operator should do, because the documentation is missing - it was copied from the Max operator.
+# The case with dim=-1 is covered with other operators in test "test_eltwise_binary_ops_per_test_plan".
+# This test covers all other values for the dim parameter.
+@pytest.mark.xfail(reason="Operator is not working for dim parameter different from -1.")
+@pytest.mark.parametrize("shape", [(1, 3, 3, 3)])
+@pytest.mark.parametrize("dim", [-2, 0, 1, 2])
+@pytest.mark.parametrize("model", [ModelFromHost, ModelFromAnotherOp])
+def test_eltwise_binary_binarystack_operator(test_device, shape, dim, model):
+
+    kwargs={}
+    kwargs['dim'] = dim
+
+    verify(
+        test_device=test_device,
+        model_type=model,
+        input_operator="BinaryStack",
+        input_shape=shape,
+        number_of_operands=2,
+        kwargs=kwargs,
+    )
+
+
+# Test function for running single operator test with specific parameters
+# with all models except prologued
+@pytest.mark.skip
+def test_eltwise_binary_ops_per_test_plan_single(
+    bin_op,
+    bin_model,
+    bin_shape,
+    test_device
+):
+
+    model = eval(bin_model)
+    shape = eval(bin_shape) if type(bin_shape) is str else bin_shape
+
+    test_eltwise_binary_ops_per_test_plan(bin_op, model, shape, test_device)
+
+
+# Test function for running single operator test with specific parameters
+# with prologued model
+@pytest.mark.skip
+def test_eltwise_binary_ops_per_test_plan_single_prologued(
+    bin_op,
+    bin_shape_prologued,
+    test_device
+):
+    model = ModelFromDramQueuePrologued
+    shape, source_flag, should_prolog = eval(bin_shape_prologued)
+
+    test_eltwise_binary_ops_per_test_plan_dram_prologued(bin_op, model, shape, source_flag, should_prolog, test_device)
+
+
 # ------------------------------------------------------------------------------------------------------------
 # Old test implementation using not simplified test models:
 # (These old tests are deactivated)
From cfab4d7ee0d8db4aa5466420581c656378343a4c Mon Sep 17 00:00:00 2001
From: Konstantin Milanovic
Date: Mon, 1 Jul 2024 12:00:07 +0000
Subject: [PATCH 054/116] test all element-wise unary operators according to
 test plan

(cherry picked from commit
01976288007516a75f2bdfb719427dc902ddd38e)
---
 .../test/operators/eltwise_unary/conftest.py  |   3 +-
 .../models/test_plan/__init__.py              |   6 +
 .../test_plan/model_op_src_const_inputs1.py   |  51 ++
 .../test_plan/model_op_src_from_another_op.py |  43 ++
 .../test_plan/model_op_src_from_dram.py       |  43 ++
 .../test_plan/model_op_src_from_host.py       |  42 ++
 .../test_plan/model_op_src_from_tm_edge1.py   |  43 ++
 .../test_plan/model_op_src_from_tm_edge2.py   |  44 ++
 .../operators/eltwise_unary/test_command.sh   |  10 +-
 .../eltwise_unary/test_eltwise_unary.py       | 693 +++++++++++++++++-
 pybuda/test/operators/utils/utils.py          |   3 +-
 11 files changed, 977 insertions(+), 4 deletions(-)
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/__init__.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_const_inputs1.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_another_op.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_dram.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_host.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge1.py
 create mode 100644 pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge2.py

diff --git a/pybuda/test/operators/eltwise_unary/conftest.py b/pybuda/test/operators/eltwise_unary/conftest.py
index c4e500fc..b0e52b0c 100644
--- a/pybuda/test/operators/eltwise_unary/conftest.py
+++ b/pybuda/test/operators/eltwise_unary/conftest.py
@@ -68,7 +68,8 @@ def pytest_generate_tests(metafunc):
 
     option_shape = metafunc.config.option.un_shape
     if 'un_shape' in metafunc.fixturenames and option_shape is not None:
-        metafunc.parametrize("un_shape", [option_shape])
+        shape = eval(option_shape) if type(option_shape) == str else option_shape
+        metafunc.parametrize("un_shape", [shape])
 
     option_op = metafunc.config.option.un_op
     if 'un_op' in metafunc.fixturenames and option_op is not None:
diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/__init__.py b/pybuda/test/operators/eltwise_unary/models/test_plan/__init__.py
new file mode 100644
index 00000000..62979be3
--- /dev/null
+++ b/pybuda/test/operators/eltwise_unary/models/test_plan/__init__.py
@@ -0,0 +1,6 @@
+from .model_op_src_const_inputs1 import BudaElementWiseUnaryTest
+from .model_op_src_from_another_op import BudaElementWiseUnaryTest
+from .model_op_src_from_dram import BudaElementWiseUnaryTest
+from .model_op_src_from_host import BudaElementWiseUnaryTest
+from .model_op_src_from_tm_edge1 import BudaElementWiseUnaryTest
+from .model_op_src_from_tm_edge2 import BudaElementWiseUnaryTest
diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_const_inputs1.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_const_inputs1.py
new file mode 100644
index 00000000..54cccd43
--- /dev/null
+++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_const_inputs1.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+#
+# Element-wise unary operator defined by PyBuda API
+# These kinds of tests test only a single specific operator through different PyBuda architectures
+#
+
+
+import torch
+
+import pybuda
+
+from pybuda import PyBudaModule
+
+from test.operators.utils.utils import ShapeUtils
+
+
+class BudaElementWiseUnaryTest(PyBudaModule):
+    """
+    Element-wise unary operator test _ Const Inputs _ const eval
pass" + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: Const Inputs: (const eval pass) + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. / + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator " + opname + " test_Const Inputs_Const eval pass") + self.testname = "Element-wise unary operator " + opname + " test_Const Inputs_Const eval pass" + self.operator = operator + self.opname = opname + self.shape = ShapeUtils.reduce_microbatch_size(shape) + self.kwargs = kwargs + + self.add_constant("c1") + self.set_constant("c1", pybuda.Tensor.create_from_torch(torch.rand(*self.shape, requires_grad=False), constant=True)) + + def forward(self, x): + un1 = self.operator(self.opname + "1", self.get_constant("c1"), **self.kwargs) + un2 = self.operator(self.opname + "2", x, **self.kwargs) + add1 = pybuda.op.Add("add1",un1, un2) + return add1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_another_op.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_another_op.py new file mode 100644 index 00000000..f309f4a2 --- /dev/null +++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_another_op.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Matmul operator defined by PyBuda API +# These kinds of tests test only single specific operator through different PyBuda architectures +# + + +import pybuda + +from pybuda import PyBudaModule + + +class BudaElementWiseUnaryTest(PyBudaModule): + """ + Element-wise unary operator test - from another op (calc) + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: From another op (calc) + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. 
/ + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator " + opname + " test _ from another op _ calc") + self.testname = "Element-wise unary operator " + opname + " test _ from another op _ calc" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x): + add1 = pybuda.op.Add("add1", x, x) + un1 = self.operator(self.opname + "1", add1, **self.kwargs) + return un1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_dram.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_dram.py new file mode 100644 index 00000000..02f95e4d --- /dev/null +++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_dram.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Matmul operator defined by PyBuda API +# These kinds of tests test only single specific operator through different PyBuda architectures +# + + +import pybuda + +from pybuda import PyBudaModule + + +class BudaElementWiseUnaryTest(PyBudaModule): + """ + Element-wise unary operator test - from dram queue - input_queue flag = false + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: From dram queue - input_queue flag = false + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. / + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator " + opname + " test _ from dram queue _ flag") + self.testname = "Element-wise unary operator " + opname + " test _ from dram queue _ flag" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x1): + + un1 = self.operator(self.opname + "1", x1, **self.kwargs) + return un1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_host.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_host.py new file mode 100644 index 00000000..c6a45475 --- /dev/null +++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_host.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Matmul operator defined by PyBuda API +# These kinds of tests test only single specific operator through different PyBuda architectures +# + + +import pybuda + +from pybuda import PyBudaModule + + +class BudaElementWiseUnaryTest(PyBudaModule): + """ + Element-wise unary operator test - from host + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: From host + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. 
/ + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator " + opname + " test _ from host") + self.testname = "Element-wise unary operator " + opname + " test _ from host" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x): + un1 = self.operator(self.opname + "1", x, **self.kwargs) + return un1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge1.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge1.py new file mode 100644 index 00000000..5d045fda --- /dev/null +++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge1.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Matmul operator defined by PyBuda API +# These kinds of tests test only single specific operator through different PyBuda architectures +# + + +import pybuda + +from pybuda import PyBudaModule + + +class BudaElementWiseUnaryTest(PyBudaModule): + """ + Element-wise unary operator test - from tm edge + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: From tm edge: tm -> input + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. / + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator test _ from tm edge") + self.testname = "Element-wise unary operator test _ from tm edge" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x): + tr1 = pybuda.op.Transpose("tr1", x, -1, -2) + un1 = self.operator(self.opname + "1", tr1, **self.kwargs) + return un1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge2.py b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge2.py new file mode 100644 index 00000000..4ca11d05 --- /dev/null +++ b/pybuda/test/operators/eltwise_unary/models/test_plan/model_op_src_from_tm_edge2.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Matmul operator defined by PyBuda API +# These kinds of tests test only single specific operator through different PyBuda architectures +# + + +import pybuda + +from pybuda import PyBudaModule + + +class BudaElementWiseUnaryTest(PyBudaModule): + """ + Element-wise unary operator test - from operator -> tm -> input + + According to Test plan with this model we are testing: + 1. Op type: One of the element-wise unary operator + 2. Operand source: From tm edge: Combination: operator -> tm -> input + 3. Operand shapes: All cases in combination with this operand source + 4. Operand / output size of dimensions: All cases in combination with this operand source + 5. / + 6. 
/ + + """ + + def __init__(self, operator, opname, shape, **kwargs): + super().__init__("Element-wise unary operator test - from operator_tm_input") + self.testname = "Element-wise unary operator test - from operator_tm_input" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x): + add1 = pybuda.op.Add("add1", x, x) + tr1 = pybuda.op.Transpose("tr1", add1, -1, -2) + un1 = self.operator(self.opname + "1", tr1, **self.kwargs) + return un1 + + def values(self): + return [item.value() for item in self.inputs] diff --git a/pybuda/test/operators/eltwise_unary/test_command.sh b/pybuda/test/operators/eltwise_unary/test_command.sh index 76a061d4..76b35334 100644 --- a/pybuda/test/operators/eltwise_unary/test_command.sh +++ b/pybuda/test/operators/eltwise_unary/test_command.sh @@ -31,4 +31,12 @@ pytest -svv test_eltwise_unary_single.py --un_model model_4 --un_train True --un # pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_6 --un_train True --un_recompute False --un_op 'Relu' --un_shape '[1, 12, 13]' -# pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_7 --un_train True --un_recompute True --un_op 'Exp' --un_shape '[1, 12, 13]' \ No newline at end of file +# pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_7 --un_train True --un_recompute True --un_op 'Exp' --un_shape '[1, 12, 13]' + +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary.py::test_eltwise_unary_ops_per_test_plan_single --un_model 'model_op_src_from_host' --un_shape '[1, 32, 96, 128]' --un_op 'Exp' --runxfail --no-skips +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary.py::test_eltwise_unary_ops_per_test_plan_pow_single --un_model 'model_op_src_from_host' --un_shape '[1, 32, 96, 128]' --un_kwargs_json='{"exponent": 0.54881352186203}' --runxfail --no-skips +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary.py::test_eltwise_unary_ops_per_test_plan_clip_single --un_model 'model_op_src_from_host' --un_shape '[1, 32, 96, 128]' --un_kwargs_json='{"min": 0.54881352186203, "max": 1}' --runxfail --no-skips +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary.py::test_eltwise_unary_ops_per_test_plan_cum_sum_single --un_model 'model_op_src_from_host' --un_shape '[1, 32, 96, 128]' --un_kwargs_json='{"exclusive": "False"}' --runxfail --no-skips + + +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary.py::test_eltwise_unary_ops_per_test_plan_clip_single --un_model 'model_op_src_from_host' --un_shape '[1, 32, 96, 128]' --un_kwargs_json='{"min": "None", "max": "None"}' --runxfail --no-skips diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py index 44e9778e..8b634d22 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py @@ -7,15 +7,706 @@ # In this test we use pytorch tensors and operators to verify buda operators # +# GENERAL OP SUPPORT TEST PLAN: +# 1. Operand type - any supported type +# 2. Operand source(s): +# (+) 2.1 From another op +# - Operator -> input +# (+) 2.2 From tm edge +# - Combination: operator -> tm -> input +# - tm -> input +# (+) 2.3 From DRAM queue +# - input_queue flag = false +# - Special case of From host? May it be triggered if the operator is not the first node of the network? 
+# - Can this be triggered from pybuda.Parameter? +# - Can this be triggered from big pybuda.Constant? +# (/) 2.4 From DRAM, but prologued (constant) +# - Constants must be small enough to fit into L1 +# - Verification via netlists that scenario is triggered +# - Input are not prologued for microbatch size = 1 +# (+) 2.5 Const Inputs (const eval pass) +# - Operator where all inputs are constants. Does it make difference if tensor is big > L1 +# - Verification via netlists that scenario is triggered??? +# (+) 2.6 From host +# - Input tensor as input of network -> Operator is first node in network and input_queue flag = true +# - Can this scenario be triggered from pybuda.Parameter? +# - Can this be triggered from big pybuda.Constant? +# 3 Operand shapes type(s): +# (+) 3.1 Full tensor (i.e. full expected shape) +# - Is 3 dims max for all ops? Ex. Conv is 3d max +# (+) 3.2 Tensor reduce on one or more dims to 1 +# - Vector +# - Only one dim is not equal to 1 +# (/) 3.3 Scalar +# - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value +# 4. Operand / output size of dimensions (few examples of each, 10 values total) +# (+) 4.1 Divisible by 32 +# (+) 4.2 Prime numbers +# (+) 4.3 Very large (thousands, 10s of thousands) +# - 100x100, 100x1000 +# - maybe nightly only +# (+) 4.4 Extreme ratios between height/width +# 4.5 ...probably many more interesting combinations here +# 5. Data format - all supported formats +# (/) 5.1 Output DF +# (/) 5.2 Intermediate DF +# (/) 5.3 Accumulation DF +# (+) 5.4 Operand DFs +# (+) 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 +# (/) 7. Special attributes - if applicable.. like approx_mode for Exp, for example + +# 10379 passed, 190 skipped, 4270 xfailed, 18 xpassed, 3 warnings in 2797.58s (0:46:37) + import os +from typing import Dict, List import pytest import numpy as np +import math import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config +from pybuda.verify.backend import verify_module + +from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.conftest import TestDevice + +from pybuda.module import PyBudaModule + +from pybuda.op_repo.datatypes import TensorShape + from . 
import models +from .models import test_plan + + + + +TEST_PLAN_MODELS_PATH = "./pybuda/test/operators/eltwise_unary/models/test_plan/" + + + +########## HELPER METHOD + +def verify( + test_device: TestDevice, + input_model: PyBudaModule, + input_operator: str, + input_shape: TensorShape, + kwargs:Dict = {}, + input_params: List[Dict] = [], + input_dev_data_format: pybuda.DataFormat = None, + input_math_fidelity: pybuda.MathFidelity = None, + pcc: float = 0.99 + ): + '''Common verification function for all tests''' + + architecture = f'test_plan.{input_model}.BudaElementWiseUnaryTest(operator=pybuda.op.{input_operator}, opname="{input_operator}", shape={input_shape}' + for k, v in kwargs.items(): + architecture = f'{architecture}, {k}={v}' + architecture = f'{architecture})' + model = eval(architecture) + + input_shapes = tuple([input_shape]) + + if input_model == "model_op_src_from_dram": + CompilerUtils.set_input_source(InputSourceFlags.FROM_DRAM.value) + elif input_model == "model_op_src_from_host": + CompilerUtils.set_input_source(InputSourceFlags.FROM_HOST.value) + + if input_math_fidelity: + CompilerUtils.set_math_fidelity(input_math_fidelity) + + if input_dev_data_format: + input_params.append({"dev_data_format": input_dev_data_format}) + + VerifyUtils.verify(model, test_device, input_shapes, input_params, pcc) + + file_path = VerifyUtils.get_netlist_filename() + match model: + case "model_op_src_from_dram": + assert netlist_utils.read_netlist_value(file_path, "/queues/x1/loc") == 'dram' + case "model_op_src_const_inputs1": + d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + for key in d.keys(): + assert input_operator not in key + + + +########## ALL UNARY OPERATORS THAT ARE TESTED + +def get_eltwise_unary_operators(): + return [ + "Abs", + "LeakyRelu", + "Exp", + "Identity", + "Reciprocal", + "Sigmoid", + "Sqrt", + "Gelu", + "Log", + "Relu", + "Buffer", + "Tanh", + "Sine", + "Cosine", + "Argmax", + # "Dropout", # have their own test + # "LogicalNot", # have their own test + # "Tilize", # have their own test + # "Pow", # have their own test + # "Clip", # have their own test + # "CumSum", # have their own test + ] + + + + +########## ALL INPUT SHAPES USED FOR EACH OPERATOR IN TESTS + +def get_input_shapes(): + return [ + # 2 dim with microbatch size = 1 + (1, 4), #0 # 3.1 Full tensor (i.e. full expected shape) + (1, 17), #1 # 3.1 Full tensor (i.e. full expected shape) + (1, 23), #2 # 3.2 Tensor reduce on one or more dims to 1 + (1, 1), #3 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100), #4 # 4.3 Very large (thousands, 10s of thousands) + (1, 500), #5 # 4.3 Very large (thousands, 10s of thousands) + (1, 1000), #6 # 4.4 Extreme ratios between height/width + (1, 1920), #7 # 4.4 Extreme ratios between height/width + (1, 10000), #8 # 4.4 Extreme ratios between height/width + (1, 64), #9 # 4.1 Divisible by 32 + (1, 96), #10 # 4.1 Divisible by 32 + (1, 41), #11 # 4.2 Prime numbers + (1, 3), #12 # 4.2 Prime numbers + + # # 2 dim with microbatch size > 1 + (3, 4), #13 # 3.1 Full tensor (i.e. full expected shape) + (45, 17), #14 # 3.1 Full tensor (i.e. 
full expected shape) + (2, 23), #15 # 3.2 Tensor reduce on one or more dims to 1 + (64, 1), #16 # 3.2 Tensor reduce on one or more dims to 1 + (100, 100), #17 # 4.3 Very large (thousands, 10s of thousands) + (1000, 100), #18 # 4.3 Very large (thousands, 10s of thousands) + (10, 1000), #19 # 4.4 Extreme ratios between height/width + (9920, 1), #20 # 4.4 Extreme ratios between height/width + (10000, 1), #21 # 4.4 Extreme ratios between height/width + (32, 64), #22 # 4.1 Divisible by 32 + (160, 96), #23 # 4.1 Divisible by 32 + (17, 41), #24 # 4.2 Prime numbers + (89, 3), #25 # 4.2 Prime numbers + + # 3 dim with microbatch size = 1 + (1, 3, 4), #26 # 3.1 Full tensor (i.e. full expected shape) + (1, 45, 17), #27 # 3.1 Full tensor (i.e. full expected shape) + (1, 1, 23), #28 # 3.2 Tensor reduce on one or more dims to 1 + (1, 64, 1), #29 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100, 100), #30 # 4.3 Very large (thousands, 10s of thousands) + (1, 1000, 100), #31 # 4.3 Very large (thousands, 10s of thousands) + (1, 10, 1000), #32 # 4.4 Extreme ratios between height/width + (1, 9920, 1), #33 # 4.4 Extreme ratios between height/width + (1, 10000, 1), #34 # 4.4 Extreme ratios between height/width + (1, 32, 64), #35 # 4.1 Divisible by 32 + (1, 160, 96), #36 # 4.1 Divisible by 32 + (1, 17, 41), #37 # 4.2 Prime numbers + (1, 89, 3), #38 # 4.2 Prime numbers + + # 3 dim with microbatch size > 1 + (2, 3, 4), #39 # 3.1 Full tensor (i.e. full expected shape) + (11, 45, 17), #40 # 3.1 Full tensor (i.e. full expected shape) + (11, 1, 23), #41 # 3.2 Tensor reduce on one or more dims to 1 + (11, 64, 1), #42 # 3.2 Tensor reduce on one or more dims to 1 + (100, 100, 100), #43 # 4.3 Very large (thousands, 10s of thousands) + (10, 1000, 100), #44 # 4.3 Very large (thousands, 10s of thousands) + (2, 10, 1000), #45 # 4.4 Extreme ratios between height/width + (2, 9920, 1), #46 # 4.4 Extreme ratios between height/width + (10, 10000, 1), #47 # 4.4 Extreme ratios between height/width + (32, 32, 64), #48 # 4.1 Divisible by 32 + (64, 160, 96), #49 # 4.1 Divisible by 32 + (11, 17, 41), #50 # 4.2 Prime numbers + (13, 89, 3), #51 # 4.2 Prime numbers + + # 4 dim with microbatch size = 1 + (1, 2, 3, 4), #52 # 3.1 Full tensor (i.e. full expected shape) + (1, 11, 45, 17), #53 # 3.1 Full tensor (i.e. full expected shape) + (1, 11, 1, 23), #54 # 3.2 Tensor reduce on one or more dims to 1 + (1, 11, 64, 1), #55 # 3.2 Tensor reduce on one or more dims to 1 + (1, 100, 100, 100), #56 # 4.3 Very large (thousands, 10s of thousands) + (1, 10, 1000, 100), #57 # 4.3 Very large (thousands, 10s of thousands) + (1, 1, 10, 1000), #58 # 4.4 Extreme ratios between height/width + (1, 1, 9920, 1), #59 # 4.4 Extreme ratios between height/width + (1, 10, 10000, 1), #60 # 4.4 Extreme ratios between height/width + (1, 32, 32, 64), #61 # 4.1 Divisible by 32 + (1, 64, 160, 96), #62 # 4.1 Divisible by 32 + (1, 11, 17, 41), #63 # 4.2 Prime numbers + (1, 13, 89, 3), #64 # 4.2 Prime numbers + + # 4 dim with microbatch size > 1 + (3, 11, 45, 17), #65 # 3.1 Full tensor (i.e. full expected shape) + (2, 2, 3, 4), #66 # 3.1 Full tensor (i.e. 
full expected shape) + (4, 11, 1, 23), #67 # 3.2 Tensor reduce on one or more dims to 1 + (5, 11, 64, 1), #68 # 3.2 Tensor reduce on one or more dims to 1 + (6, 100, 100, 100), #69 # 4.3 Very large (thousands, 10s of thousands) + (7, 10, 1000, 100), #70 # 4.3 Very large (thousands, 10s of thousands) + (8, 1, 10, 1000), #71 # 4.4 Extreme ratios between height/width + (9, 1, 9920, 1), #72 # 4.4 Extreme ratios between height/width + (10, 10, 10000, 1), #73 # 4.4 Extreme ratios between height/width + (11, 32, 32, 64), #74 # 4.1 Divisible by 32 + #Fatal Python error: Segmentation fault + pytest.param((12, 64, 160, 96), marks=pytest.mark.skip(reason="Inference fail due to seg fault")), #75 # 4.1 Divisible by 32 + (13, 11, 17, 41), #76 # 4.2 Prime numbers + (14, 13, 89, 3), #77 # 4.2 Prime numbers + ] + + + +########## HELPER METHOD USED FOR ERROR SUMMARY + +def xfail_test(input_operator, input_shape, input_model, input_kwargs): + s = get_input_shapes() + micro_batch_size = input_shape[0] + match input_operator: + case "Argmax": + if(len(input_shape) == 2 and micro_batch_size > 1 and input_model in ("model_op_src_from_another_op", "model_op_src_from_tm_edge2")): + # E AssertionError: Error during inference + pytest.xfail("Inference failed") + elif(input_shape in ((s[16],) + (s[20],) + (s[21],)) and input_model == "model_op_src_from_tm_edge1"): + # E AssertionError: Error during inference + pytest.xfail("Inference failed") + elif(input_shape in ((s[31],) + (s[33],) + (s[36],) + (s[44],) + (s[46],) + (s[49],)+ (s[56],) + (s[57],) + (s[59],) + tuple(s[60:63]) + (s[69],) + (s[70],) + tuple(s[72:75]))): + # E RuntimeError: 1/2/3 Nodes have no valid grids, exiting + pytest.xfail("RuntimeError") + case "Dropout": + # Error message: E AssertionError: Data mismatch detected + pytest.xfail("Data mismatch") + case "LogicalNot": + # Error message: E KeyError: 'logical_not' + pytest.xfail("Not implemented operator") + case "Tilize": + # Error message: E AttributeError: module 'torch' has no attribute 'tensors' + pytest.xfail("Inference failed") + case "CumSum": + if input_model in ("model_op_src_from_dram", "model_op_src_from_host", "model_op_src_from_another_op"): + # E RuntimeError: Input operand not mapped to new graph during lowering: CumSum1 + pytest.xfail("RuntimeError") + elif input_model in ("model_op_src_const_inputs1", "model_op_src_from_tm_edge1", "model_op_src_from_tm_edge2"): + # E RuntimeError: TT_ASSERT @ pybuda/csrc/passes/lowering_context.cpp:28: old_node->node_type() != graphlib::NodeType::kPyOp + pytest.xfail("RuntimeError") + case "Pow": + if(micro_batch_size > 1): + if(input_kwargs['exponent'] not in (1000, 10000) and len(input_shape) == 2): + # E AssertionError: Error during inference + pytest.xfail("Inference failed") + elif(input_kwargs['exponent'] == 1000): + if(input_shape in (tuple(s[13:26]))): + # E AssertionError: Error during inference + pytest.xfail('Inference failed') + elif(input_model in ("model_op_src_from_host", "model_op_src_from_tm_edge1", "model_op_src_from_dram") and input_shape in ((s[39],) + (s[41],) + (s[66],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif(input_model in ("model_op_src_const_inputs1") and input_shape in (s[39],)): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif(input_kwargs['exponent'] == 10000): + if(input_shape in (tuple(s[13:26]))): + # E AssertionError: Error during inference + pytest.xfail('Inference failed') + elif(input_model in ("model_op_src_from_host", 
"model_op_src_from_tm_edge1", "model_op_src_from_dram") and input_shape in (tuple(s[39:52]) + tuple(s[65:69]) + tuple(s[71:75]) + tuple(s[76:78]))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif(input_model in ("model_op_src_const_inputs1") and input_shape in ((s[39],) + (s[41],) + (s[66],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + else: + match input_model: + case "model_op_src_from_host": + if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],)) ): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + case "model_op_src_from_dram": + if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + case "model_op_src_const_inputs1": + if (input_kwargs['exponent'] == 160 and input_shape in (s[3],)): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:2]) + (s[3],) + (s[12],) + (s[26],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:4]) + tuple(s[11:13]) + (s[26],) + (s[28],) + (s[52],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + case "model_op_src_from_tm_edge1": + if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): + # E AssertionError: Data mismatch detected + pytest.xfail('Data missmatch') + case "model_op_src_from_another_op", "model_op_src_from_tm_edge2": + return + case _: + if(len(input_shape) == 2 and micro_batch_size > 1): + # E AssertionError: Error during inference + pytest.xfail('Inference failed') + + + + + +########## TEST ALL ELEMENT-WISE UNARY OPS + +@pytest.mark.parametrize("input_shape", get_input_shapes()) +@pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("input_operator", get_eltwise_unary_operators()) +def test_eltwise_unary_ops_per_test_plan( + input_operator, + input_model, + input_shape, + test_device, + input_dev_data_format=None, + input_math_fidelity=None +): + kwargs = {} + if input_operator == "LeakyRelu": + kwargs['alpha'] = np.random.rand() + xfail_test(input_operator, input_shape, input_model, kwargs) + verify( + input_model = input_model, + input_operator = input_operator, + input_shape = input_shape, + kwargs = kwargs, + input_dev_data_format = input_dev_data_format, + input_math_fidelity = input_math_fidelity, + test_device = test_device, + ) +# 5556 passed, 108 skipped, 
2760 xfailed, 1 warning in 1103.93s (0:18:23) + + + + +########## TEST ELEMENT-WISE UNARY OP - POW + +def get_pow_kwargs(): + return [ + # Error message: E RuntimeError: TT_ASSERT @ pybuda/csrc/graph_lib/shape.cpp:34: values.size() >= BUDA_DIM_COUNT and values.size() <= BUDA_MAX_DIM_COUNT + # 18 are always xpassed + pytest.param(0.9336911808323198, marks=pytest.mark.xfail(reason="RuntimeError")), + 0, + 1, + 2, + 3, + 47, + 160, + 1000, + 10000, + ] +@pytest.mark.parametrize("input_shape", get_input_shapes()) +@pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("input_operator", ["Pow"]) +@pytest.mark.parametrize("input_kwargs", get_pow_kwargs()) +def test_eltwise_unary_ops_per_test_plan_pow( + input_kwargs, + input_operator, + input_model, + input_shape, + test_device, + input_dev_data_format=None, + input_math_fidelity=None +): + kwargs = {} + kwargs['exponent'] = input_kwargs + xfail_test(input_operator, input_shape, input_model, kwargs) + verify( + input_model = input_model, + input_operator = input_operator, + input_shape = input_shape, + kwargs = kwargs, + input_dev_data_format = input_dev_data_format, + input_math_fidelity = input_math_fidelity, + test_device = test_device, + ) +# 2813 passed, 54 skipped, 1327 xfailed, 18 xpassed in 526.40s (0:08:46) + + + + +########## TEST ELEMENT-WISE UNARY OP - CLIP + +def get_clip_kwargs(): + return [ + # min < max + (0.4992656851851959, 0.9336911808323198), + # min > max + (0.9336911808323198, 0.4992656851851959), + (0.4992656851851959, None), + (None, 0.9336911808323198), + # Error message: E RuntimeError: yaml-cpp: error at line 22, column 70: bad conversion + pytest.param(None, None, marks=pytest.mark.xfail(reason="RuntimeError")), + ] +@pytest.mark.parametrize("input_shape", get_input_shapes()) +@pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("input_operator", ["Clip"]) +@pytest.mark.parametrize("input_kwargs_min, input_kwargs_max", get_clip_kwargs()) +def test_eltwise_unary_ops_per_test_plan_clip( + input_kwargs_min, + input_kwargs_max, + input_operator, + input_model, + input_shape, + test_device, + input_dev_data_format=None, + input_math_fidelity=None +): + kwargs = {} + kwargs['min'] = input_kwargs_min + kwargs['max'] = input_kwargs_max + xfail_test(input_operator, input_shape, input_model, kwargs) + verify( + input_model = input_model, + input_operator = input_operator, + input_shape = input_shape, + kwargs = kwargs, + input_dev_data_format = input_dev_data_format, + input_math_fidelity = input_math_fidelity, + test_device = test_device, + ) + + + + +########## TEST ELEMENT-WISE UNARY OP - CumSum + +def get_cum_sum_kwargs_exclusive(): + return [ + False, + # Error message:E Assertion error: Currently not supported + pytest.param(True, marks=pytest.mark.xfail(reason="Unsupported parameter value")) + ] +@pytest.mark.parametrize("input_shape", get_input_shapes()) +@pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("input_operator", ["CumSum"]) +@pytest.mark.parametrize("input_kwargs_exclusive", get_cum_sum_kwargs_exclusive()) +def test_eltwise_unary_ops_per_test_plan_cum_sum( + input_kwargs_exclusive, + input_operator, + input_model, + input_shape, + test_device, + input_dev_data_format=None, + input_math_fidelity=None +): + kwargs = 
{} + kwargs['axis'] = np.random.randint(0, len(input_shape)) + kwargs['exclusive'] = input_kwargs_exclusive + xfail_test(input_operator, input_shape, input_model, kwargs) + verify( + input_model = input_model, + input_operator = input_operator, + input_shape = input_shape, + kwargs = kwargs, + input_dev_data_format = input_dev_data_format, + input_math_fidelity = input_math_fidelity, + test_device = test_device, + ) +# 12 skipped, 924 xfailed in 11.99s + + + + +########## TEST ELEMENT-WISE UNARY OP - Dropout/LogicalNot/Tilize +### tests for this ops are always failing, because of that, they are tested only in single combination of model and input shape +### when they are fixed (bug reports: #2803, #2590, #2593) uncomment those 3 ops in function 'get_eltwise_unary_operators' and then run test named 'test_eltwise_unary_ops_per_test_plan' with flags: --runxfail --no-skips, so they can be tested in all needed combinations and uncomment next line: +# @pytest.mark.skip +@pytest.mark.parametrize("input_shape", [(1, 4)]) +@pytest.mark.parametrize("input_model", ["model_op_src_from_host"]) +@pytest.mark.parametrize("input_operator", ["Dropout", "LogicalNot", "Tilize"]) +def test_eltwise_unary_ops_per_test_plan_droput_logicalnot_tilize( + input_operator, + input_model, + input_shape, + test_device, + input_dev_data_format=None, + input_math_fidelity=None +): + kwargs = {} + xfail_test(input_operator, input_shape, input_model, kwargs) + verify( + input_model = input_model, + input_operator = input_operator, + input_shape = input_shape, + kwargs = kwargs, + input_dev_data_format = input_dev_data_format, + input_math_fidelity = input_math_fidelity, + test_device = test_device, + ) + + + + + +########## TEST DATA FORMAT AND MATH FIDELITY FOR ALL ELEMENT-WISE UNARY OPS + +# We will not test all combinations of Data Format and Math Fidelity because it would be too much tests. +# 1. First we will choose Data Format to be Float16_b and test all Math Fidelity values +# 2. Then we will set Math Fidelity to HiFi4 and test all Data Formats. + +def get_input_shape(): + return (1, 45, 17) #0 # 3.1 Full tensor (i.e. 
full expected shape) + +dev_data_formats = [ + pybuda.DataFormat.Float16_b, +] + +compiler_math_fidelity = [ + pybuda.MathFidelity.LoFi, + pybuda.MathFidelity.HiFi2, + pybuda.MathFidelity.HiFi3, + pybuda.MathFidelity.HiFi4, + ] + +@pytest.mark.parametrize("input_operator", get_eltwise_unary_operators()) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_ops_mf_inputs(input_operator, test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan(input_operator, "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 60 passed, 12 xfailed in 8.12s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_pow_mf_inputs(test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan_pow(1, "Pow", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 4 passed in 1.55s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_clip_mf_inputs(test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan_clip(np.random.rand(), np.random.rand(), "Clip", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 4 passed in 1.67s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_cum_sum_mf_inputs(test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan_cum_sum(False, "CumSum", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 4 xfailed in 1.37s + + + +dev_data_formats=[ + pybuda.DataFormat.Bfp2, + pybuda.DataFormat.Bfp2_b, + pybuda.DataFormat.Bfp4, + pybuda.DataFormat.Bfp4_b, + pybuda.DataFormat.Bfp8, + pybuda.DataFormat.Bfp8_b, + pybuda.DataFormat.Float16, + pybuda.DataFormat.Float16_b, + pybuda.DataFormat.Float32, + pybuda.DataFormat.Int8, + pybuda.DataFormat.Lf8, + pybuda.DataFormat.RawUInt16, + pybuda.DataFormat.RawUInt32, + pybuda.DataFormat.RawUInt8, + pybuda.DataFormat.UInt16, +] + +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] + +@pytest.mark.parametrize("input_operator", get_eltwise_unary_operators()) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_ops_df_inputs(input_operator, test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan(input_operator, "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 225 passed, 45 xfailed in 36.47s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_pow_df_inputs(test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan_pow(1, "Pow", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 15 passed in 2.71s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_clip_df_inputs(test_device, dev_data_format, math_fidelity): + 
test_eltwise_unary_ops_per_test_plan_clip(np.random.rand(), np.random.rand(), "Clip", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 15 passed in 2.69s + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_eltwise_unary_op_cum_sum_df_inputs(test_device, dev_data_format, math_fidelity): + test_eltwise_unary_ops_per_test_plan_cum_sum(False, "CumSum", "model_op_src_from_host", get_input_shape(), test_device, dev_data_format, math_fidelity) +# 15 xfailed in 1.47s + + + + + + +########## SINGLE TEST FOR ALL ELEMENT-WISE UNARY OPS +# run only from command line +# used to reproduce bugs + +@pytest.mark.skip +def test_eltwise_unary_ops_per_test_plan_single( + un_op, + un_model, + un_shape, + test_device +): + test_eltwise_unary_ops_per_test_plan(un_op, un_model, un_shape, test_device) + +@pytest.mark.skip +def test_eltwise_unary_ops_per_test_plan_pow_single( + un_model, + un_shape, + un_kwargs, + test_device +): + test_eltwise_unary_ops_per_test_plan_pow(un_kwargs['exponent'], "Pow", un_model, un_shape, test_device) + +@pytest.mark.skip +def test_eltwise_unary_ops_per_test_plan_clip_single( + un_model, + un_shape, + un_kwargs, + test_device +): + test_eltwise_unary_ops_per_test_plan_clip(un_kwargs['min'], un_kwargs['max'], "Clip", un_model, un_shape, test_device) + +@pytest.mark.skip +def test_eltwise_unary_ops_per_test_plan_cum_sum_single( + un_model, + un_shape, + un_kwargs, + test_device +): + test_eltwise_unary_ops_per_test_plan_cum_sum(un_kwargs['exclusive'], "CumSum", un_model, un_shape, test_device) + + + + + + + + + + + +####################################################################################### + +########## OLD TESTS +# those tests are skipped MODELS_PATH = "./pybuda/test/operators/eltwise_unary/models/" @@ -53,7 +744,7 @@ @pytest.mark.parametrize("operation", ["Abs", "LeakyRelu", "Exp", "Identity", "Reciprocal", "Sigmoid", "Sqrt", "Gelu", "Log", "Relu", "Buffer", "Tanh", "Dropout", "Sine", "Cosine", "Argmax", "Clip"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) @pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) -def test_eltwise_unary( +def obsoleted_test_eltwise_unary( op_test_kind, operation, model, diff --git a/pybuda/test/operators/utils/utils.py b/pybuda/test/operators/utils/utils.py index 47b09e9a..a62823a8 100644 --- a/pybuda/test/operators/utils/utils.py +++ b/pybuda/test/operators/utils/utils.py @@ -72,7 +72,7 @@ class VerifyUtils: '''Utility functions for PyBuda verification''' @staticmethod - def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[TensorShape], input_params: List[Dict] = []): + def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[TensorShape], input_params: List[Dict] = [], pcc = 0.99): '''Perform PyBuda verification on the model Args: @@ -89,6 +89,7 @@ def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[Tens test_kind=TestKind.INFERENCE, devtype=test_device.devtype, arch=test_device.arch, + pcc=pcc, ), input_params=[input_params], ) From e0e89ecd82a8fe5a721d25515d1e2a3f6ea2dec9 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 26 Jul 2024 18:59:41 +0000 Subject: [PATCH 055/116] [perf_wh] Fix perf Issue (cherry picked from commit 76cb86eb087b260f40f31ba9b7095b1ead12ed18) --- pybuda/csrc/passes/commute_utils.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) 
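A minimal usage sketch (illustrative only, not part of any patch in this series) of the `pcc` argument threaded through `VerifyUtils.verify` in the hunk above; the test name, operator choice, and the 0.95 threshold are assumptions:

    import pybuda
    from test.operators.utils import VerifyUtils
    from test.operators.eltwise_unary.models.test_plan.model_op_src_from_host import BudaElementWiseUnaryTest

    def test_exp_with_relaxed_pcc(test_device):
        # Build one of the new test-plan models directly, bypassing the
        # eval()-based helper in test_eltwise_unary.py.
        model = BudaElementWiseUnaryTest(operator=pybuda.op.Exp, opname="Exp", shape=(1, 45, 17))
        # pcc defaults to 0.99 inside VerifyUtils.verify; a lower threshold
        # loosens the correlation check in VerifyConfig, so it should be
        # reserved for operators with known numerical noise.
        VerifyUtils.verify(model, test_device, input_shapes=((1, 45, 17),), pcc=0.95)
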
diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index 69d6e067..c9b57796 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -575,7 +575,11 @@ bool commute_through_reduce( prev_nodes = op_users; } - if (not commute_up and initial_op->op_name() == "transpose") { + // NOTE: We only allow this commute to happen if the transpose we are attempting to commute is Int32 + // This is because this commute can reduce perf on some models. The reason we allow it if the df is + // Int32 is because Int32 transpose is not feasible on silicon, and so we must allow efforts to + // commute these transposes out of quantized regions to go forward. + if (not commute_up and initial_op->op_name() == "transpose" and initial_op->output_df() == tt::DataFormat::Int32) { int dim0 = initial_op->op_type().get_attr_as("dim0"); int dim1 = initial_op->op_type().get_attr_as("dim1"); From 08bb315e9ba33488cdfc2db00f68d534287ab61e Mon Sep 17 00:00:00 2001 From: Jovan Serbedzija Date: Sat, 27 Jul 2024 15:32:02 +0000 Subject: [PATCH 056/116] Remove whitespaces from test names in operator eltwise binary tests (cherry picked from commit 5d764433094877c5ff1342f180327f231fc9f8f9) --- .../eltwise_binary/test_eltwise_binary.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index 8804fb3b..89a2212c 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -85,8 +85,8 @@ class ModelFromAnotherOp(PyBudaModule): model_name = "model_op_src_from_another_op" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from another op") - self.testname = "Element-wise binary operator " + opname + " test _ op src from another op" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_another_op") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_another_op" self.operator = operator self.opname = opname self.shape = shape @@ -105,8 +105,8 @@ class ModelFromHost(PyBudaModule): model_name = "model_op_src_from_host" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from host") - self.testname = "Element-wise binary operator " + opname + " test _ op src from host" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_host") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_host" self.operator = operator self.opname = opname self.shape = shape @@ -122,8 +122,8 @@ class ModelFromDramQueue(PyBudaModule): model_name = "model_op_src_from_dram_queue" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue") - self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_dram_queue") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_dram_queue" self.operator = operator self.opname = opname self.shape = shape @@ -139,8 +139,8 @@ class ModelFromDramQueuePrologued(PyBudaModule): model_name = "model_op_src_from_dram_queue_prologued" def 
__init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from dram queue prologued") - self.testname = "Element-wise binary operator " + opname + " test _ op src from dram queue prologued" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_dram_queue_prologued") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_dram_queue_prologued" self.operator = operator self.opname = opname self.shape = shape @@ -164,8 +164,8 @@ class ModelConstEvalPass(PyBudaModule): model_name = "model_op_src_const_eval_pass" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src const eval pass") - self.testname = "Element-wise binary operator " + opname + " test _ op src const eval pass" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_const_eval_pass") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_const_eval_pass" self.operator = operator self.opname = opname self.shape = shape @@ -199,8 +199,8 @@ class ModelOpSrcFromTmEdge1(PyBudaModule): model_name = "model_op_src_from_tm_edge1" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge1") - self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge1" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_tm_edge1") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_tm_edge1" self.operator = operator self.opname = opname self.shape = shape @@ -218,8 +218,8 @@ class ModelOpSrcFromTmEdge2(PyBudaModule): model_name = "model_op_src_from_tm_edge2" def __init__(self, operator, opname, shape, kwargs): - super().__init__("Element-wise binary operator " + opname + " test _ op src from tm edge2") - self.testname = "Element-wise binary operator " + opname + " test _ op src from tm edge2" + super().__init__("Element_wise_binary_operator_" + opname + "_test_op_src_from_tm_edge2") + self.testname = "Element_wise_binary_operator_" + opname + "_test_op_src_from_tm_edge2" self.operator = operator self.opname = opname self.shape = shape From 661cfb26768e1de66548d01e2232296fe5d01dcf Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Thu, 18 Jul 2024 13:16:46 +0000 Subject: [PATCH 057/116] Extend VerifyUtils.verify Extend VerifyUtils.verify with input source flag, math fidelity and data format Issue #2554 / #2787 (cherry picked from commit 7474cddfb964050a455e8cbb8510f04302f78d62) --- .../eltwise_binary/test_eltwise_binary.py | 21 +++++++------- .../eltwise_unary/test_eltwise_unary.py | 28 +++++++++---------- pybuda/test/operators/nary/test_stack.py | 21 +++++++------- pybuda/test/operators/utils/utils.py | 24 +++++++++++++++- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index 89a2212c..7f80533c 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -70,7 +70,7 @@ from pybuda import PyBudaModule from pybuda.op_repo import TensorShape -from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.operators.utils 
import ShapeUtils from test.conftest import TestDevice @@ -253,16 +253,15 @@ def verify( input_shapes = tuple([input_shape for _ in range(number_of_operands)]) logger.trace(f"***input_shapes: {input_shapes}") - if input_source_flag: - CompilerUtils.set_input_source(input_source_flag.value) - - if math_fidelity: - CompilerUtils.set_math_fidelity(math_fidelity) - - if dev_data_format: - input_params.append({"dev_data_format": dev_data_format}) - - VerifyUtils.verify(model, test_device, input_shapes, input_params) + VerifyUtils.verify( + model=model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) MODEL_TYPES = [ diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py index 8b634d22..5394bd15 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py @@ -61,16 +61,12 @@ from typing import Dict, List import pytest import numpy as np -import math import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig from pybuda.verify.config import TestKind -from pybuda.config import _get_global_compiler_config -from pybuda.verify.backend import verify_module - -from test.operators.utils import netlist_utils, InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.conftest import TestDevice from pybuda.module import PyBudaModule @@ -110,18 +106,22 @@ def verify( input_shapes = tuple([input_shape]) + input_source_flag = None if input_model == "model_op_src_from_dram": - CompilerUtils.set_input_source(InputSourceFlags.FROM_DRAM.value) + input_source_flag = InputSourceFlags.FROM_DRAM elif input_model == "model_op_src_from_host": - CompilerUtils.set_input_source(InputSourceFlags.FROM_HOST.value) + input_source_flag = InputSourceFlags.FROM_HOST - if input_math_fidelity: - CompilerUtils.set_math_fidelity(input_math_fidelity) - - if input_dev_data_format: - input_params.append({"dev_data_format": input_dev_data_format}) - - VerifyUtils.verify(model, test_device, input_shapes, input_params, pcc) + VerifyUtils.verify( + model=model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + pcc=pcc, + input_source_flag=input_source_flag, + dev_data_format=input_dev_data_format, + math_fidelity=input_math_fidelity, + ) file_path = VerifyUtils.get_netlist_filename() match model: diff --git a/pybuda/test/operators/nary/test_stack.py b/pybuda/test/operators/nary/test_stack.py index 1ca5c6da..3794b050 100644 --- a/pybuda/test/operators/nary/test_stack.py +++ b/pybuda/test/operators/nary/test_stack.py @@ -66,7 +66,7 @@ from pybuda import PyBudaModule from pybuda.op_repo import TensorShape -from test.operators.utils import InputSourceFlags, CompilerUtils, VerifyUtils +from test.operators.utils import InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils from test.operators.utils import NetlistValidation from test.conftest import TestDevice @@ -78,16 +78,15 @@ def verify(model: PyBudaModule, test_device: TestDevice, input_shape: TensorShap input_shapes = tuple([input_shape for _ in range(number_of_operands)]) logger.trace(f"***input_shapes: {input_shapes}") - if input_source_flag: - CompilerUtils.set_input_source(input_source_flag.value) - - if math_fidelity: - 
CompilerUtils.set_math_fidelity(math_fidelity) - - if dev_data_format: - input_params.append({"dev_data_format": dev_data_format}) - - VerifyUtils.verify(model, test_device, input_shapes, input_params) + VerifyUtils.verify( + model=model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) # Currently, verify_module for the Stack operator and Stack operator by it self diff --git a/pybuda/test/operators/utils/utils.py b/pybuda/test/operators/utils/utils.py index a62823a8..67204534 100644 --- a/pybuda/test/operators/utils/utils.py +++ b/pybuda/test/operators/utils/utils.py @@ -72,7 +72,16 @@ class VerifyUtils: '''Utility functions for PyBuda verification''' @staticmethod - def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[TensorShape], input_params: List[Dict] = [], pcc = 0.99): + def verify( + model: PyBudaModule, + test_device: TestDevice, + input_shapes: List[TensorShape], + input_params: List[Dict] = [], + pcc: Optional[float] = None, + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, + ): '''Perform PyBuda verification on the model Args: @@ -80,8 +89,21 @@ def verify(model: PyBudaModule, test_device: TestDevice, input_shapes: List[Tens test_device: TestDevice input_shapes: List of input shapes input_params: List of input parameters + pcc: PCC value for verification + input_source_flag: Input source flag + dev_data_format: Data format + math_fidelity: Math fidelity ''' + if input_source_flag: + CompilerUtils.set_input_source(input_source_flag.value) + + if math_fidelity: + CompilerUtils.set_math_fidelity(math_fidelity) + + if dev_data_format: + input_params.append({"dev_data_format": dev_data_format}) + verify_module( model, input_shapes=input_shapes, From 5be3a5657ac8eda421326d5fda81f3817c93e8e9 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Mon, 29 Jul 2024 14:20:01 +0000 Subject: [PATCH 058/116] Bringup QDQ mlp_mixer (cherry picked from commit a0975402082a0e1077e7041df63ba748225e987a) --- pybuda/csrc/graph_lib/utils.cpp | 13 +++++++++--- pybuda/pybuda/tvm_to_python.py | 37 +++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pybuda/csrc/graph_lib/utils.cpp b/pybuda/csrc/graph_lib/utils.cpp index 49c9903a..dff84075 100644 --- a/pybuda/csrc/graph_lib/utils.cpp +++ b/pybuda/csrc/graph_lib/utils.cpp @@ -517,8 +517,6 @@ std::vector topological_sort(const Graph &graph, std::functiondata_users(node).size() > 1, "Node only has one user, do not fork."); - // If the node passed is an input node then just fork it graphlib::InputNode *input = dynamic_cast(node); graphlib::OpNode *op = dynamic_cast(node); @@ -526,7 +524,8 @@ void fork_subgraph(Graph *graph, Node *node) { input->get_consteval_graph(graph, true, true); // create graph before clone so input node name is correct std::vector user_edges = graph->user_data_edges(input); TT_ASSERT(graph->data_operands(input).size() == 0, "Input can't have operands"); - for (int i = 1; i < (int)user_edges.size(); i++) + std::vector removed_to_forked; + for (int i = 0; i < (int)user_edges.size(); i++) { graphlib::Edge const &user_edge = user_edges[i]; log_trace( @@ -546,7 +545,15 @@ void fork_subgraph(Graph *graph, Node *node) { Edge new_user_edge = Edge(clone->id(), user_edge.producer_output_port_id, user_edge.consumer_node_id, user_edge.consumer_input_port_id, 
user_edge.edge_type); graph->add_edge(new_user_edge, attr); + removed_to_forked.push_back(clone); } + + graphlib::Node *first_forked = removed_to_forked[0]; + auto removed_node = graph->remove_node(input); + + // Need to maintain original name because user can access it by name + first_forked->set_name(removed_node->name()); + } else if (op) { std::vector user_edges = graph->user_data_edges(op); diff --git a/pybuda/pybuda/tvm_to_python.py b/pybuda/pybuda/tvm_to_python.py index 6c3244d0..7b3c793d 100644 --- a/pybuda/pybuda/tvm_to_python.py +++ b/pybuda/pybuda/tvm_to_python.py @@ -640,7 +640,6 @@ def populate_conv2d_transpose_args(graph, nid, compiler_cfg): in_channel = input_node["attrs"]["shape"][0][0][0] break groups = int(node["attrs"]["groups"][0][0]) - assert groups == 1 or (in_channel is not None and groups == in_channel), "Only supports group of 1 or in_channel" args.append(("groups", f"{groups}",)) kernel_size = [int(kernel) for kernel in node["attrs"]["kernel_size"][0]] @@ -1743,6 +1742,7 @@ def is_nop_reshape(nid): graph_input_names = {} params = {} constants = {} + removed_zp_nids = [] ops = {} returns = {} returns_requiring_batch_dim_fix = [] @@ -1831,7 +1831,7 @@ def make_parser_friendly_name(node, node_type): f"Node: {nid} shape: {node['buda_shape']} name: {node['buda_name']} type: Constant" ) - elif node["op"] == "const": + elif node["op"] == "const" or node["op"] == "constant": if isinstance(json_graph["params"][node["name"]], np.ndarray): tensor = torch.from_numpy(json_graph["params"][node["name"]]) else: @@ -1907,10 +1907,27 @@ def make_parser_friendly_name(node, node_type): if node["name"] == "qnn.quantize": assert int(node["attrs"]["num_inputs"]) == 3 zp_node = graph["nodes"][node["inputs"][2][0]] - zp_node_name = zp_node['name'] - assert zp_node['nid'] in constants - zp_value = json_graph["params"][zp_node_name] - del constants[zp_node["nid"]] + # In case tvm added an op (such as cast) between zp and dequantize + if zp_node['op'] != 'constant': + if 'inputs' in zp_node: + zp_node_input = graph['nodes'][zp_node['inputs'][0][0]] + if zp_node_input['op'] == 'constant': + zp_node = zp_node_input + zp_value = torch.tensor([0]) + if 'users' in zp_node: + users = zp_node['users'] + for user in users: + if user in ops: + del ops[user] + else: + zp_node_name = zp_node['name'] + zp_value = json_graph["params"][zp_node_name] + + if zp_node['nid'] in constants: + assert zp_node['nid'] not in removed_zp_nids + del constants[zp_node["nid"]] + + removed_zp_nids.append(zp_node["nid"]) if zp_value.size == 1: args.append(("zero_point", f"{float(zp_value.item())}")) else: @@ -1921,7 +1938,6 @@ def make_parser_friendly_name(node, node_type): assert int(node["attrs"]["num_inputs"]) == 3 zp_node = graph["nodes"][node["inputs"][2][0]] - # In case tvm added an op (such as cast) between zp and dequantize if zp_node['op'] != 'constant': if 'inputs' in zp_node: @@ -1938,8 +1954,11 @@ def make_parser_friendly_name(node, node_type): zp_node_name = zp_node['name'] zp_value = json_graph["params"][zp_node_name] - assert zp_node['nid'] in constants - del constants[zp_node["nid"]] + if zp_node['nid'] in constants: + assert zp_node['nid'] not in removed_zp_nids + del constants[zp_node["nid"]] + + removed_zp_nids.append(zp_node["nid"]) if zp_value.size == 1: args.append(("zero_point", f"{zp_value.item()}")) else: From c42491efbc1284c67be4ba03a8e4cb92d4b421bc Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 31 Jul 2024 19:44:08 +0000 Subject: [PATCH 059/116] Assert that no module inputs are 
transformers Cache objects

(cherry picked from commit ffb1dd8f2675cc345a2a6798851cf3d2b79bebd3)
---
 pybuda/pybuda/tensor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pybuda/pybuda/tensor.py b/pybuda/pybuda/tensor.py
index ac545758..a4cfed64 100644
--- a/pybuda/pybuda/tensor.py
+++ b/pybuda/pybuda/tensor.py
@@ -15,6 +15,7 @@
 import jaxlib
 import jax.numpy as jnp
 import json
+import transformers
 
 from .pybudaglobal import TILE_DIM, align_up_tile, round_up_div
 from pybuda._C import DataFormat
@@ -1078,6 +1079,8 @@ def to_pt_tensors(tensors: Union[Tuple[Union[torch.Tensor, Tensor, tf.Tensor], .
             pytorch_tensors.append(torch.Tensor(t))
         elif isinstance(t, mxnet.ndarray.ndarray.NDArray):
             pytorch_tensors.append(torch.Tensor(t.asnumpy()))
+        elif isinstance(t, transformers.cache_utils.Cache):
+            raise RuntimeError(f"Unsupported input tensor type: {type(t)}. If you wish to use transformers past-cache, please use legacy cache.")
         elif isinstance(t, jaxlib.xla_extension.DeviceArray):
             pytorch_tensors.append(torch.Tensor(np.array(t)))
         else:

From 89ff70064e952bf298ccda712fefc61b7b7c6b0a Mon Sep 17 00:00:00 2001
From: chandrasekaranpradeep
Date: Thu, 1 Aug 2024 10:46:15 +0000
Subject: [PATCH 060/116] Update tt-buda-demos track_pybuda branch

(cherry picked from commit 7c1598793c812c072f8f160fbb0078051932a0eb)
---
 third_party/buda-model-demos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/buda-model-demos b/third_party/buda-model-demos
index e3c6976f..86facd10 160000
--- a/third_party/buda-model-demos
+++ b/third_party/buda-model-demos
@@ -1 +1 @@
-Subproject commit e3c6976f4392b5e06f4e4c041a851d800fdcb353
+Subproject commit 86facd10ad329bdf8fe029dbd263b906af30c56c

From 14727b5f706f62120515e6ea322f2ac50a12368e Mon Sep 17 00:00:00 2001
From: kkannan
Date: Thu, 1 Aug 2024 16:08:47 +0000
Subject: [PATCH 061/116] Add pybuda test for phi2(n300&n150) - pytorch

(cherry picked from commit 2afd9fb4a4fff591996d556d5e16bcc294f9faf1)
---
 .../high_prio/nlp/pytorch/test_phi2.py | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py

diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py
new file mode 100644
index 00000000..bd51bf91
--- /dev/null
+++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py
@@ -0,0 +1,69 @@
+import pybuda
+from pybuda.verify.backend import verify_module
+from pybuda import VerifyConfig
+from pybuda.verify.config import TestKind
+import torch
+from transformers import PhiForCausalLM, AutoTokenizer, PhiConfig
+import os
+import pytest
+
+# The masked fill kernel produced invalid results in the Silicon BackendType,
+# so verification in BBE is disabled for the Silicon BackendType.
+# Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712
+
+variants = ["microsoft/phi-2", "microsoft/phi-2-pytdml"]
+
+
+@pytest.mark.parametrize("variant", variants, ids=variants)
+def test_phi2_clm(test_device, variant):
+
+    # Configurations
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b
+    compiler_cfg.balancer_policy = "Ribbon"
+    os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "20480"
+    compiler_cfg.amp_level = 1
+    os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1"
+
+    # Load PhiConfig from pretrained variant, disable return_dict and caching.
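+    # Note: return_dict=False makes the model return plain tuples instead of a
+    # ModelOutput object, and use_cache=False keeps transformers Cache objects
+    # out of the forward outputs -- to_pt_tensors (see the tensor.py change in
+    # the patch above) rejects Cache inputs and asks for legacy past-cache.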
+ config = PhiConfig.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = PhiConfig(**config_dict) + + # Load model and tokenizer from HuggingFace + model = PhiForCausalLM.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + # input_prompt + input_prompt = "Write a detailed analogy between mathematics and a lighthouse." + + # Tokenize input + inputs = tokenizer( + input_prompt, + return_tensors="pt", + max_length=256, + pad_to_max_length=True, + truncation=True, + ) + + input_ids = inputs["input_ids"].to(torch.int32) + attn_mask = inputs["attention_mask"].to(torch.float32) + + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + verify_module( + tt_model, + input_shapes=[(input_ids.shape,attn_mask.shape,)], + inputs=[(input_ids,attn_mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, + ), + ) From 7265ad688387bb3b1afce80a6d938730c85ce309 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 2 Aug 2024 13:33:37 +0000 Subject: [PATCH 062/116] Bringup QDQ centernet and monodle (cherry picked from commit 7f58e26075d86a07dcaf5bc078bf38bbb6fb3722) --- pybuda/csrc/passes/commute_utils.cpp | 10 ++---- pybuda/csrc/passes/insert_inverse_on_io.cpp | 3 +- pybuda/csrc/passes/make_quantized_ops.cpp | 7 +++-- pybuda/pybuda/op/eval/pybuda/convolution.py | 34 ++++++++++++--------- pybuda/pybuda/op/eval/sparse_utils.py | 5 ++- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index c9b57796..7d233207 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -926,10 +926,7 @@ bool commute_through_quantization( // check if axis moved to the right (or in the same place) while (new_axis < (int)commute_shape->size()) { if ((*commute_shape)[new_axis] == op->shape()[axis]) { - if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) - and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { - can_commute = true; - } + can_commute = true; break; } new_axis++; @@ -938,10 +935,7 @@ bool commute_through_quantization( new_axis = axis-1; while (new_axis >= 0) { if ((*commute_shape)[new_axis] == op->shape()[axis]) { - if (volume_above(commute_shape->as_vector(), new_axis) == volume_above(op->shape().as_vector(), axis) - and volume_below(commute_shape->as_vector(), new_axis) == volume_below(op->shape().as_vector(), axis)) { - can_commute = true; - } + can_commute = true; break; } new_axis--; diff --git a/pybuda/csrc/passes/insert_inverse_on_io.cpp b/pybuda/csrc/passes/insert_inverse_on_io.cpp index bada13cd..7f859719 100644 --- a/pybuda/csrc/passes/insert_inverse_on_io.cpp +++ b/pybuda/csrc/passes/insert_inverse_on_io.cpp @@ -65,7 +65,8 @@ void add_inverse_to_input_edges( { if (clone_0_op->op_name() == "reshape") try_consteval_op(graph, clone_0_op, true); - else if (clone_1_op->op_name() == "transpose" and not graph->enable_training() and not input->requires_grad()) + // if training is enabled then only consteval if 
the input does not require grad + else if (clone_1_op->op_name() == "transpose" and (not graph->enable_training() or not input->requires_grad())) try_consteval_op(graph, clone_0_op, true); } } diff --git a/pybuda/csrc/passes/make_quantized_ops.cpp b/pybuda/csrc/passes/make_quantized_ops.cpp index 4232b78d..f0dbb184 100644 --- a/pybuda/csrc/passes/make_quantized_ops.cpp +++ b/pybuda/csrc/passes/make_quantized_ops.cpp @@ -70,7 +70,7 @@ bool is_quantizeable_conv2d(graphlib::Graph *graph, graphlib::Node *conv2d) { if (not conv_op) return false; - if (conv_op->op_type().op != "conv2d") + if (conv_op->op_type().op != "conv2d" and conv_op->op_type().op != "conv2d_transpose") return false; // All inputs must be dequantize nodes @@ -226,7 +226,7 @@ void make_quantized_add(graphlib::Graph *graph, graphlib::OpNode *add) { void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { TT_ASSERT(conv2d, "Null OpNode pointer given."); - TT_ASSERT(conv2d->op_type().op == "conv2d", "OpNode is not conv2d"); + TT_ASSERT(conv2d->op_type().op == "conv2d" or conv2d->op_type().op == "conv2d_transpose", "OpNode is not conv2d or conv2d_transpose"); TT_ASSERT(is_quantizeable_conv2d(graph, conv2d), "conv2d is not quantizeable."); graphlib::OpNode *deq_act = dynamic_cast(graph->data_operands(conv2d)[0]); @@ -314,9 +314,10 @@ void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { conv2d->set_output_df(DataFormat::Int32); } -const std::array quantizeable_ops{ +const std::array quantizeable_ops{ "matmul", "conv2d", + "conv2d_transpose", "add" }; bool make_quantized_ops(graphlib::Graph *graph) { diff --git a/pybuda/pybuda/op/eval/pybuda/convolution.py b/pybuda/pybuda/op/eval/pybuda/convolution.py index f221a353..b43162ea 100644 --- a/pybuda/pybuda/op/eval/pybuda/convolution.py +++ b/pybuda/pybuda/op/eval/pybuda/convolution.py @@ -220,35 +220,35 @@ def rotate_convtranspose2d_weights(dc, weights, cin, cout, depthwise, groups, kH # Conv2dTranspose has shape (cin, cout, kH, kW), need to transpose (0,1) first # Note: weights for regular conv are (out_channels, in_channels/groups, kH, kW) # Note: weights for transpo conv are (in_channels, out_channels/groups, kH, kW) - weights = dc.op(TransposeTM.create(0, 1), [weights]) - weights = dc.op("reshape", [weights], (1, cout, cin // groups, kH * kW)) - weights = dc.op(TransposeTM.create(2, 3), [weights]) # Transpose weight + weights = dc.op(TransposeTM.create(0, 1), [weights], output_df=weights.output_df) + weights = dc.op("reshape", [weights], (1, cout, cin // groups, kH * kW), output_df=weights.output_df) + weights = dc.op(TransposeTM.create(2, 3), [weights], output_df=weights.output_df) # Transpose weight # Create weight dident to rotate last 2 dims by 180 degrees # eg. 
[[1,2] ,[3,4]] -> [[4,3] ,[2,1]] if cout > 1: - weights = dc.op("hstack", [weights], (cout,)) + weights = dc.op("hstack", [weights], (cout,), output_df=weights.output_df) weight_dident = create_conv2d_transpose_weight_dident(kH, kW, tile_align=False).unsqueeze(0).unsqueeze(0) weight_dident_tensor = dc.tensor(weight_dident) - weights = dc.op("sparse_matmul", [weight_dident_tensor, weights]) + weights = dc.op("sparse_matmul", [weight_dident_tensor, weights], output_df=weights.output_df) if cout > 1: row_after_hslice = weights.shape[-1] // cout if row_after_hslice % TILE_DIM != 0: orig_w_shape = weights.shape - weights = dc.op("reshape", [weights], (orig_w_shape[-4], orig_w_shape[-3], orig_w_shape[-2]*cout, row_after_hslice)) - weights = dc.op("pad_tile", [weights], (-1, weights.shape[-1])) - weights = dc.op("reshape", [weights], (orig_w_shape[-4], orig_w_shape[-3], orig_w_shape[-2], align_up_tile(row_after_hslice)*cout)) - weights = dc.op("hslice", [weights], (cout,)) - weights = dc.op("narrow", [weights], (-1, 0, row_after_hslice, weights.shape[-1])) + weights = dc.op("reshape", [weights], (orig_w_shape[-4], orig_w_shape[-3], orig_w_shape[-2]*cout, row_after_hslice), output_df=weights.output_df) + weights = dc.op("pad_tile", [weights], (-1, weights.shape[-1]), output_df=weights.output_df) + weights = dc.op("reshape", [weights], (orig_w_shape[-4], orig_w_shape[-3], orig_w_shape[-2], align_up_tile(row_after_hslice)*cout), output_df=weights.output_df) + weights = dc.op("hslice", [weights], (cout,), output_df=weights.output_df) + weights = dc.op("narrow", [weights], (-1, 0, row_after_hslice, weights.shape[-1]), output_df=weights.output_df) else: - weights = dc.op("hslice", [weights], (cout,)) - weights = dc.op(TransposeTM.create(2, 3), [weights]) # Transpose weight + weights = dc.op("hslice", [weights], (cout,), output_df=weights.output_df) + weights = dc.op(TransposeTM.create(2, 3), [weights], output_df=weights.output_df) # Transpose weight # Reshape into conv2d weight shape if depthwise: - weights = dc.op("reshape", [weights], (cin, cout // groups, kH, kW)) + weights = dc.op("reshape", [weights], (cin, cout // groups, kH, kW), output_df=weights.output_df) else: - weights = dc.op("reshape", [weights], (cout // groups, cin, kH, kW)) + weights = dc.op("reshape", [weights], (cout // groups, cin, kH, kW), output_df=weights.output_df) return weights @@ -363,7 +363,11 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): # pickers are created row-major, starting from top-left kernel pixel y_shift = ((kH - 1) // 2) - kY x_shift = ((kW - 1) // 2) - kX - picker = create_conv2d_sparse_picker_matrix(y, x, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True, sparse_r_pad=padded_r, sparse_c_pad=padded_c) + if is_convtranspose2d: + picker = create_conv2d_sparse_picker_matrix(y, x, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True, sparse_r_pad=padded_r, sparse_c_pad=padded_c, is_convtranspose2d=is_convtranspose2d, yout_transpose=yout_transpose, xout_transpose= xout_transpose) + else: + picker = create_conv2d_sparse_picker_matrix(y, x, y_shift, x_shift, kH, kW, stride, padding, dilation, tile_align=True, sparse_r_pad=padded_r, sparse_c_pad=padded_c) + if is_convtranspose2d and stride_transpose > 1: picker = torch.sparse.mm(picker, transpose_tensor) pickers.append(picker) diff --git a/pybuda/pybuda/op/eval/sparse_utils.py b/pybuda/pybuda/op/eval/sparse_utils.py index f94fd596..e7183325 100644 --- a/pybuda/pybuda/op/eval/sparse_utils.py +++ 
b/pybuda/pybuda/op/eval/sparse_utils.py @@ -324,7 +324,7 @@ def create_conv2d_picker_matrix(y, x, y_shift, x_shift, stride, tile_align=False def create_conv2d_sparse_picker_matrix( - y, x, y_shift, x_shift, k_y, k_x, stride, padding, dilation, tile_align=False, pad_x_only=False, sparse_r_pad=0, sparse_c_pad=0 + y, x, y_shift, x_shift, k_y, k_x, stride, padding, dilation, tile_align=False, pad_x_only=False, sparse_r_pad=0, sparse_c_pad=0, is_convtranspose2d=False, yout_transpose=None, xout_transpose=None ): cols = torch.arange(start=1, end=y * x + 1).view(y, x) @@ -340,6 +340,9 @@ def create_conv2d_sparse_picker_matrix( out_y, out_x = calculate_conv2d_output_dimensions( y, x, [k_y, k_x], stride, padding, dilation ) + if is_convtranspose2d: + out_y = yout_transpose + out_x = xout_transpose cols = torch.nn.functional.pad( cols, (0, out_x - cols.shape[1], 0, out_y - cols.shape[0]) From 545337c000c804b62925616681d79bf1d2285ab4 Mon Sep 17 00:00:00 2001 From: Vladica Obojevic Date: Tue, 6 Aug 2024 05:11:49 +0000 Subject: [PATCH 063/116] Add tests for pytorch binary operators (cherry picked from commit 423365f0eb57ea89b04ace7826b9077651454770) --- .../eltwise_binary/__init__.py | 3 + .../eltwise_binary/conftest.py | 11 + .../eltwise_binary/test_pytorch_binary.py | 429 ++++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100644 pybuda/test/operators_pytorch/eltwise_binary/__init__.py create mode 100644 pybuda/test/operators_pytorch/eltwise_binary/conftest.py create mode 100644 pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py diff --git a/pybuda/test/operators_pytorch/eltwise_binary/__init__.py b/pybuda/test/operators_pytorch/eltwise_binary/__init__.py new file mode 100644 index 00000000..2332467e --- /dev/null +++ b/pybuda/test/operators_pytorch/eltwise_binary/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 diff --git a/pybuda/test/operators_pytorch/eltwise_binary/conftest.py b/pybuda/test/operators_pytorch/eltwise_binary/conftest.py new file mode 100644 index 00000000..134210f5 --- /dev/null +++ b/pybuda/test/operators_pytorch/eltwise_binary/conftest.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +def pytest_configure(config): + config.addinivalue_line( + "markers", 'slow: marks tests as slow (deselect with -m "not slow")' + ) + config.addinivalue_line( + "markers", 'run_in_pp: marks tests to run in pipeline' + ) diff --git a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py new file mode 100644 index 00000000..206f56f8 --- /dev/null +++ b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py @@ -0,0 +1,429 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Tests for testing of element-wise binary operators +# +# In this test we test pytorch binary operators + + +import pytest + +from typing import List, Dict, Type +from loguru import logger + +import torch +import pybuda +import pybuda.op + +from pybuda.op_repo import TensorShape +from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils +from test.operators.utils import ShapeUtils +from test.conftest import TestDevice + + +class ModelFromAnotherOp(torch.nn.Module): + + model_name = "model_op_src_from_another_op" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelFromAnotherOp, 
self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_from_another_op" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x: torch.Tensor, y: torch.Tensor): + x.retain_grad() + y.retain_grad() + # we use Add and Subtract operators to create two operands which are inputs for the binary operator + xx = torch.add(x, y) + yy = torch.sub(x, y) + output = self.operator(xx, yy, **self.kwargs) + return output + + +class ModelFromHost(torch.nn.Module): + + model_name = "model_op_src_from_host" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelFromHost, self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_from_host" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x: torch.Tensor, y: torch.Tensor): + x.retain_grad() + y.retain_grad() + output = self.operator(x, y, **self.kwargs) + return output + + +class ModelFromDramQueue(torch.nn.Module): + + model_name = "model_op_src_from_dram_queue" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelFromDramQueue, self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_from_dram_queue" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x: torch.Tensor, y: torch.Tensor): + x.retain_grad() + y.retain_grad() + output = self.operator(x, y, **self.kwargs) + return output + + +class ModelConstEvalPass(torch.nn.Module): + + model_name = "model_op_src_const_eval_pass" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelConstEvalPass, self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_const_eval_pass" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + self.constant_shape = ShapeUtils.reduce_microbatch_size(shape) + + # self.c1 = torch.rand(*self.constant_shape) + # self.c2 = torch.rand(*self.constant_shape) + self.c1 = (torch.rand(*self.constant_shape, requires_grad=False) - 0.5).detach() + self.c2 = (torch.rand(*self.constant_shape, requires_grad=False) - 0.5).detach() + + def forward(self, x, y): + v1 = self.operator(self.c1, self.c2, **self.kwargs) + # v2 and v3 consume inputs + x.retain_grad() + y.retain_grad() + v2 = torch.add(x, y) + v3 = torch.add(v1, v2) + return v3 + + +class ModelOpSrcFromTmEdge1(torch.nn.Module): + + model_name = "model_op_src_from_tm_edge1" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelOpSrcFromTmEdge1, self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_from_tm_edge1" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x, y): + x.retain_grad() + y.retain_grad() + xx = torch.add(x, y) + yy = torch.transpose(xx, -1, -2) + output = self.operator(yy, yy, **self.kwargs) + return output + + +class ModelOpSrcFromTmEdge2(torch.nn.Module): + + model_name = "model_op_src_from_tm_edge2" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelOpSrcFromTmEdge2, self).__init__() + self.testname = "Element_wise_pytorch_binary_operator_" + opname + "_test_op_src_from_tm_edge2" + self.operator = operator + self.opname = opname + self.shape = shape + self.kwargs = kwargs + + def forward(self, x, y): + x.retain_grad() + 
y.retain_grad() + xx = torch.transpose(x, -1, -2) + yy = torch.transpose(y, -1, -2) + output = self.operator(xx, yy, **self.kwargs) + return output + + +def verify( + test_device: TestDevice, + model_type: Type[torch.nn.Module], + input_operator: str, + input_shape: TensorShape, + number_of_operands: int, + kwargs: Dict = {}, + input_params: List[Dict] = [], + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, +): + '''Common verification function for all tests''' + + operator = getattr(torch, input_operator) + + pytorch_model = model_type(operator=operator, opname=input_operator, shape=input_shape, kwargs=kwargs) + pybuda_model = pybuda.PyTorchModule(pytorch_model.model_name, pytorch_model) + + input_shapes = tuple([input_shape for _ in range(number_of_operands)]) + logger.trace(f"***input_shapes: {input_shapes}") + + VerifyUtils.verify( + model=pybuda_model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) + + +MODEL_TYPES = [ + # ModelFromAnotherOp, + ModelFromHost, + # ModelFromDramQueue, + # ModelConstEvalPass, + # ModelOpSrcFromTmEdge1, + # ModelOpSrcFromTmEdge2, +] + + +def get_eltwise_binary_ops(): + return [ + "add", #00 + "div", #01 + "divide", #02 + "mul", #03 + "multiply", #04 + "sub", #05 + "subtract", #06 + "true_divide", #07 + "eq", #08 + "ne", #09 + "le", #10 + "ge", #11 + "greater", #12 + "greater_equal", #13 + "gt", #14 + "less_equal", #15 + "lt", #16 + "less", #17 + "maximum", #18 + "minimum", #19 + "not_equal", #20 + ] + +def get_input_shapes(): + return [ + # 2-dimensional shape, microbatch_size = 1: + pytest.param((1, 4), marks=pytest.mark.run_in_pp), #00 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 17), marks=pytest.mark.slow), #01 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 23), marks=pytest.mark.slow), #02 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 1), marks=pytest.mark.slow), #03 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 100), marks=pytest.mark.slow), #04 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 500), marks=pytest.mark.slow), #05 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 1000), marks=pytest.mark.slow), #06 # 4.4 Extreme ratios between height/width + pytest.param((1, 1920), marks=pytest.mark.slow), #07 # 4.4 Extreme ratios between height/width + pytest.param((1, 10000), marks=pytest.mark.slow), #08 # 4.4 Extreme ratios between height/width + pytest.param((1, 64), marks=pytest.mark.run_in_pp), #09 # 4.1 Divisible by 32 + pytest.param((1, 96), marks=pytest.mark.slow), #10 # 4.1 Divisible by 32 + pytest.param((1, 41), marks=pytest.mark.slow), #11 # 4.2 Prime numbers + pytest.param((1, 3), marks=pytest.mark.slow), #12 # 4.2 Prime numbers + + # 2-dimensional shape, microbatch_size > 1: + # All shapes fails for all operators + pytest.param((3, 4), #13 # 3.1 Full tensor (i.e. full expected shape) + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.run_in_pp]), + pytest.param((45, 17), #14 # 3.1 Full tensor (i.e. 
full expected shape) + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((64, 1), #15 # 3.2 Tensor reduce on one or more dims to 1 + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((100, 100), #16 # 4.3 Very large (thousands, 10s of thousands) + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((1000, 100), #17 # 4.3 Very large (thousands, 10s of thousands) + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((10, 1000), #18 # 4.4 Extreme ratios between height/width + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((9920, 1), #19 # 4.4 Extreme ratios between height/width + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((10000, 1), #20 # 4.4 Extreme ratios between height/width + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((32, 64), #21 # 4.1 Divisible by 32 + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((160, 96), #22 # 4.1 Divisible by 32 + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + pytest.param((17, 41), #23 # 4.2 Prime numbers + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.run_in_pp]), + pytest.param((89, 3), #24 # 4.2 Prime numbers + marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + pytest.mark.slow]), + + # 3-dimensional shape, microbatch_size = 1: + pytest.param((1, 3, 4), marks=pytest.mark.run_in_pp), #25 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 45, 17), marks=pytest.mark.slow), #26 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 1, 23), marks=pytest.mark.slow), #27 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 64, 1), marks=pytest.mark.slow), #28 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 100, 100), marks=pytest.mark.slow), #29 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 1000, 100), marks=pytest.mark.slow), #30 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 10, 1000), marks=pytest.mark.slow), #31 # 4.4 Extreme ratios between height/width + pytest.param((1, 9920, 1), marks=pytest.mark.slow), #32 # 4.4 Extreme ratios between height/width + pytest.param((1, 10000, 1), marks=pytest.mark.slow), #33 # 4.4 Extreme ratios between height/width + pytest.param((1, 32, 64), marks=pytest.mark.run_in_pp), #34 # 4.1 Divisible by 32 + pytest.param((1, 160, 96), marks=pytest.mark.slow), #35 # 4.1 Divisible by 32 + pytest.param((1, 17, 41), marks=pytest.mark.slow), #36 # 4.2 Prime numbers + pytest.param((1, 89, 3), marks=pytest.mark.slow), #37 # 4.2 Prime numbers + + # 3-dimensional shape, microbatch_size > 1: + pytest.param((2, 3, 4), marks=pytest.mark.run_in_pp), #38 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((11, 45, 17), marks=pytest.mark.slow), #39 # 3.1 Full tensor (i.e. 
full expected shape) + pytest.param((11, 1, 23), marks=pytest.mark.slow), #40 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((11, 64, 1), marks=pytest.mark.slow), #41 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((100, 100, 100), marks=pytest.mark.slow), #42 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((10, 1000, 100), marks=pytest.mark.slow), #43 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((10, 10000, 1), marks=pytest.mark.slow), #44 # 4.4 Extreme ratios between height/width + pytest.param((32, 32, 64), marks=pytest.mark.slow), #45 # 4.1 Divisible by 32 + pytest.param((64, 160, 96), marks=pytest.mark.slow), #46 # 4.1 Divisible by 32 + pytest.param((11, 17, 41), marks=pytest.mark.run_in_pp), #47 # 4.2 Prime numbers + pytest.param((13, 89, 3), marks=pytest.mark.slow), #48 # 4.2 Prime numbers + + # 4-dimensional shape, microbatch_size = 1: + pytest.param((1, 2, 3, 4), marks=pytest.mark.run_in_pp), #49 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 11, 45, 17), marks=pytest.mark.slow), #50 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 11, 1, 23), marks=pytest.mark.slow), #51 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 11, 64, 1), marks=pytest.mark.slow), #52 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 100, 100, 100), marks=pytest.mark.slow), #53 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 10, 1000, 100), marks=pytest.mark.slow), #54 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 1, 10, 1000), marks=pytest.mark.slow), #55 # 4.4 Extreme ratios between height/width + pytest.param((1, 1, 9920, 1), marks=pytest.mark.slow), #56 # 4.4 Extreme ratios between height/width + pytest.param((1, 10, 10000, 1), marks=pytest.mark.slow), #57 # 4.4 Extreme ratios between height/width + pytest.param((1, 32, 32, 64), marks=pytest.mark.run_in_pp), #58 # 4.1 Divisible by 32 + pytest.param((1, 64, 160, 96), marks=pytest.mark.slow), #59 # 4.1 Divisible by 32 + pytest.param((1, 11, 17, 41), marks=pytest.mark.slow), #60 # 4.2 Prime numbers + pytest.param((1, 13, 89, 3), marks=pytest.mark.slow), #61 # 4.2 Prime numbers + + # 4-dimensional shape, microbatch_size > 1: + pytest.param((3, 11, 45, 17), marks=pytest.mark.run_in_pp), #62 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((2, 2, 3, 4), marks=pytest.mark.slow), #63 # 3.1 Full tensor (i.e. 
full expected shape) + pytest.param((4, 11, 1, 23), marks=pytest.mark.slow), #64 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((5, 11, 64, 1), marks=pytest.mark.slow), #65 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((6, 100, 100, 100), marks=pytest.mark.slow), #66 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((7, 10, 1000, 100), marks=pytest.mark.slow), #67 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((8, 1, 10, 1000), marks=pytest.mark.slow), #68 # 4.4 Extreme ratios between height/width + pytest.param((9, 1, 9920, 1), marks=pytest.mark.slow), #69 # 4.4 Extreme ratios between height/width + pytest.param((10, 10, 10000, 1), marks=pytest.mark.slow), #70 # 4.4 Extreme ratios between height/width + pytest.param((11, 32, 32, 64), marks=pytest.mark.slow), #71 # 4.1 Divisible by 32 + pytest.param((12, 64, 160, 96), #72 # 4.1 Divisible by 32 + marks=pytest.mark.skip(reason="RuntimeError: Fatal Python error: Segmentation fault")), + pytest.param((13, 11, 17, 41), marks=pytest.mark.run_in_pp), #73 # 4.2 Prime numbers + pytest.param((14, 13, 89, 3), marks=pytest.mark.slow), #74 # 4.2 Prime numbers + ] + +@pytest.mark.parametrize("input_operator", get_eltwise_binary_ops()) +@pytest.mark.parametrize("model_type", MODEL_TYPES) +@pytest.mark.parametrize("input_shape", get_input_shapes()) +def test_pytorch_eltwise_binary_ops_per_test_plan( + input_operator, + model_type, + input_shape, + test_device, + dev_data_format=None, + input_math_fidelity=None +): + + input_source_flag = None + if model_type == ModelFromDramQueue: + input_source_flag = InputSourceFlags.FROM_DRAM + + verify( + test_device=test_device, + model_type=model_type, + input_operator=input_operator, + input_shape=input_shape, + number_of_operands=2, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=input_math_fidelity, + ) + + # netlist validations: + + file_path = VerifyUtils.get_netlist_filename() + + if model_type == ModelFromDramQueue: + assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' + assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + + if model_type == ModelConstEvalPass: + # Here we check there is no key with operator name in the netlist in graphs section + d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + for key in d.keys(): + if key == "target_device": + continue + assert input_operator not in key + + +def get_not_implemented_pytorch_binary_ops(): + return [ + "atan2", #00 - NotImplementedError: The following operators are not implemented: ['aten::atan2'] + "arctan2", #01 - NotImplementedError: The following operators are not implemented: ['aten::atan2'] + "bitwise_and", #02 - RuntimeError: "bitwise_and_cpu" not implemented for 'Float' + "bitwise_or", #03 - RuntimeError: "bitwise_or_cpu" not implemented for 'Float' + "bitwise_xor", #04 - RuntimeError: "bitwise_xor_cpu" not implemented for 'Float' + "bitwise_left_shift", #05 - RuntimeError: "lshift_cpu" not implemented for 'Float' + "bitwise_right_shift", #06 - RuntimeError: "rshift_cpu" not implemented for 'Float' + "floor_divide", #07 - AssertionError: Encountered unsupported op types. Check error logs for more details + "fmod", #08 - AssertionError: Encountered unsupported op types. 
Check error logs for more details + "logaddexp", #09 - NotImplementedError: The following operators are not implemented: ['aten::logaddexp'] + "logaddexp2", #10 - NotImplementedError: The following operators are not implemented: ['aten::logaddexp2'] + "nextafter", #11 - NotImplementedError: The following operators are not implemented: ['aten::nextafter'] + "remainder", #12 - AssertionError: Encountered unsupported op types. Check error logs for more details + "fmax", #13 - NotImplementedError: The following operators are not implemented: ['aten::fmax'] + "fmin", #14 - NotImplementedError: The following operators are not implemented: ['aten::fmin'] + ] + +input_shapes=[ + (1, 2, 3, 4), +] + + +@pytest.mark.parametrize("input_operator", get_not_implemented_pytorch_binary_ops()) +@pytest.mark.parametrize("model_type", MODEL_TYPES) +@pytest.mark.parametrize("input_shape", input_shapes) +@pytest.mark.xfail(reason="Skip not implemented operators") +def test_not_implemented_pytorch_eltwise_binary_ops_per_test_plan( + input_operator, + model_type, + input_shape, + test_device, + dev_data_format=None, + input_math_fidelity=None +): + + verify( + test_device=test_device, + model_type=model_type, + input_operator=input_operator, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=input_math_fidelity, + ) From c747331e386801502c99a21e72e9abbd7a8a9510 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Tue, 6 Aug 2024 13:31:08 +0000 Subject: [PATCH 064/116] Bringup QDQ Retinanet (cherry picked from commit 24cf58c52aa43ebd7a6c32c5d3f6d66c6ddf6a92) --- .../passes/fuse_redundant_tm_sequence.hpp | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp index 92d069b4..b9556ab1 100644 --- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp +++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp @@ -200,6 +200,46 @@ namespace tt::passes OpTypeItem("transpose", {-2, -1, -1}, true), }; + static TMPattern replace_4_0 = { + OpTypeItem("reshape", {1, 2, 2, 720}, false), + }; + + static TMPattern replace_4_1 = { + OpTypeItem("reshape", {1, 14, 14, 36}, false), + }; + + static TMPattern replace_4_2 = { + OpTypeItem("reshape", {1, 7, 7, 720}, false), + }; + + static TMPattern replace_4_3 = { + OpTypeItem("reshape", {1, 7, 7, 36}, false), + }; + + static TMPattern replace_4_4 = { + OpTypeItem("reshape", {1, 4, 4, 720}, false), + }; + + static TMPattern replace_4_5 = { + OpTypeItem("reshape", {1, 4, 4, 36}, false), + }; + + static TMPattern replace_4_6 = { + OpTypeItem("reshape", {1, 14, 14, 720}, false), + }; + + static TMPattern replace_4_7 = { + OpTypeItem("reshape", {1, 2, 2, 36}, false), + }; + + static TMPattern replace_4_8 = { + OpTypeItem("reshape", {1, 28, 28, 720}, false), + }; + + static TMPattern replace_4_9 = { + OpTypeItem("reshape", {1, 28, 28, 36}, false), + }; + static TMPatternPairs pattern_map = { {pattern_0, replace_0}, {pattern_1, replace_1}, @@ -229,6 +269,16 @@ namespace tt::passes {pattern_3, replace_3_7}, {pattern_3, replace_3_8}, {pattern_3, replace_3_9}, + {pattern_4, replace_4_0}, + {pattern_4, replace_4_1}, + {pattern_4, replace_4_2}, + {pattern_4, replace_4_3}, + {pattern_4, replace_4_4}, + {pattern_4, replace_4_5}, + {pattern_4, replace_4_6}, + {pattern_4, replace_4_7}, + {pattern_4, replace_4_8}, + {pattern_4, replace_4_9}, }; bool fuse_tm_sequences(tt::graphlib::Graph* graph, TMPatternPairs& pattern_map_ = pattern_map); From 
84701b659b1c1c52d58306345e670c2ccddb0c4d Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Tue, 6 Aug 2024 06:10:34 +0000 Subject: [PATCH 065/116] Remove tri basic 2 model file and modify the import in test (cherry picked from commit 4f822919109933a625902d50b87d8f43eba681c9) --- .../model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py index e8485e8f..44890104 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -8,8 +8,10 @@ import cv2 import os +import sys +sys.path = list(set(sys.path + ["third_party/confidential_customer_models/internal/tri_basic_2/"])) -from test.model_demos.models.tri_basic_2.model.semseg import resnet34_semseg +from scripts.semseg import resnet34_semseg from pybuda.verify.backend import verify_module from pybuda import VerifyConfig From 6bfe1d22f24fd3253cf66a2d5111763cbfa949c9 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Mon, 29 Jul 2024 11:44:45 +0000 Subject: [PATCH 066/116] Move FrameworkTestUtils Refactor only Issue #2755 (cherry picked from commit a638b81bbd263af16281a7326b95ad051d8015db) --- pybuda/test/random/rgg/__init__.py | 2 ++ pybuda/test/random/rgg/frameworks.py | 39 ++++++++++++++++++++++++++++ pybuda/test/random/test_graphs.py | 39 +--------------------------- 3 files changed, 42 insertions(+), 38 deletions(-) diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py index 8817247d..160ade4d 100644 --- a/pybuda/test/random/rgg/__init__.py +++ b/pybuda/test/random/rgg/__init__.py @@ -13,6 +13,7 @@ from .base import Framework, GraphBuilder, ModelBuilder from .base import RandomizerRunner, RandomizerCodeGenerator, process_test from .frameworks import Frameworks +from .frameworks import FrameworkTestUtils from .algorithms import GraphNodeSetup from .algorithms import RandomGraphAlgorithm @@ -37,6 +38,7 @@ "RandomizerCodeGenerator", "process_test", "Frameworks", + "FrameworkTestUtils" "GraphNodeSetup", "RandomGraphAlgorithm", ] diff --git a/pybuda/test/random/rgg/frameworks.py b/pybuda/test/random/rgg/frameworks.py index 88c81b9a..c1e1ee9e 100644 --- a/pybuda/test/random/rgg/frameworks.py +++ b/pybuda/test/random/rgg/frameworks.py @@ -6,6 +6,9 @@ from enum import Enum +from typing import Tuple +from copy import copy + from .base import Framework from .pybuda.model import PyBudaModelBuilder @@ -13,6 +16,42 @@ from pybuda.op_repo import pybuda_operator_repository from pybuda.op_repo import pytorch_operator_repository +from pybuda.op_repo import OperatorDefinition + + +class FrameworkTestUtils: + + @classmethod + def copy_framework(cls, framework: Framework, skip_operators: Tuple[str] = []) -> Framework: + framework0 = framework + framework = copy(framework) + framework.operator_repository = copy(framework.operator_repository) + cls.skip_operators(framework, skip_operators) + assert len(framework.operator_repository.operators) + len(skip_operators) == len(framework0.operator_repository.operators), "Operators count should match after skipping operators" + return framework + + @classmethod + def skip_operators(cls, framework: Framework, skip_operators: Tuple[str] = []) -> None: + initial_operator_count = len(framework.operator_repository.operators) + framework.operator_repository.operators = [op for op in 
framework.operator_repository.operators if op.name not in skip_operators]
+        assert len(framework.operator_repository.operators) + len(skip_operators) == initial_operator_count, "Operators count should match after skipping operators"
+
+    @classmethod
+    def allow_operators(cls, framework: Framework, allow_operators: Tuple[str] = []) -> None:
+        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name in allow_operators]
+        assert len(allow_operators) == len(framework.operator_repository.operators), "Operators count should match the allowed operators"
+
+    @classmethod
+    def copy_operator(cls, framework: Framework, operator_name: str) -> OperatorDefinition:
+        operators = framework.operator_repository.operators
+
+        i, operator = next(((i, operator) for i, operator in enumerate(operators) if operator.name == operator_name), (None, None))
+        if not operator:
+            return None
+
+        operator = copy(operator)
+        operators[i] = operator
+        return operator
 
 
 class Frameworks(Enum):
diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py
index e802880c..80a28c0a 100644
--- a/pybuda/test/random/test_graphs.py
+++ b/pybuda/test/random/test_graphs.py
@@ -6,54 +6,17 @@
 from enum import Enum
 
 import pytest
-from typing import Tuple
 from copy import copy
 
 from pybuda.op_repo import OperatorParamNumber
-from pybuda.op_repo import OperatorDefinition
 
-from test.random.rgg import Framework
 from test.random.rgg import Frameworks
+from test.random.rgg import FrameworkTestUtils
 from test.random.rgg import RandomGraphAlgorithm
 from test.random.rgg import RandomizerConfig
 from test.random.rgg import process_test
 
 
-class FrameworkTestUtils:
-
-    @classmethod
-    def copy_framework(cls, framework: Framework, skip_operators: Tuple[str] = []) -> Framework:
-        framework0 = framework
-        framework = copy(framework)
-        framework.operator_repository = copy(framework.operator_repository)
-        cls.skip_operators(framework, skip_operators)
-        assert len(framework.operator_repository.operators) + len(skip_operators) == len(framework0.operator_repository.operators), "Operators count should match after skipping operators"
-        return framework
-
-    @classmethod
-    def skip_operators(cls, framework: Framework, skip_operators: Tuple[str] = []) -> None:
-        initial_operator_count = len(framework.operator_repository.operators)
-        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name not in skip_operators]
-        assert len(framework.operator_repository.operators) + len(skip_operators) == initial_operator_count, "Operators count should match after skipping operators"
-
-    @classmethod
-    def allow_operators(cls, framework: Framework, allow_operators: Tuple[str] = []) -> None:
-        framework.operator_repository.operators = [op for op in framework.operator_repository.operators if op.name in allow_operators]
-        assert len(allow_operators) == len(framework.operator_repository.operators), "Operators count should match allowing skipping operators"
-
-    @classmethod
-    def copy_operator(cls, framework: Framework, operator_name: str) -> OperatorDefinition:
-        operators = framework.operator_repository.operators
-
-        i, operator = next(((i, operator) for i, operator in enumerate(operators) if operator.name == operator_name), (None, None))
-        if not operator:
-            return None
-
-        operator = copy(operator)
-        operators[i] = operator
-        return operator
-
-
 class FrameworksHealthy(Enum):
     ''' Adjust repositories to test healthy operators '''

From
a8f39a4399b6f0466a808be8024f4ad241546aa5 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Mon, 29 Jul 2024 14:22:24 +0000 Subject: [PATCH 067/116] Extract FrameworksCustom Refactor only Issue #2755 (cherry picked from commit adabeba945749ae17d8662169e47090b6bdd04ec) --- pybuda/test/random/test_graphs.py | 86 +++++++++++++++++-------------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py index 80a28c0a..716b87f5 100644 --- a/pybuda/test/random/test_graphs.py +++ b/pybuda/test/random/test_graphs.py @@ -52,6 +52,26 @@ def healty_pybuda(): return framework + @staticmethod + def healty_pytorch(): + SKIP_OPERATORS = ( + "sqrt", # skip because it's failing for negative values + # "linear", + "conv2d", # skip until calc_input_shapes is properly implemented + ) + + framework = FrameworkTestUtils.copy_framework(Frameworks.PYTORCH.value, SKIP_OPERATORS) + + return framework + + PYBUDA = healty_pybuda() + PYTORCH = healty_pytorch() + + +class FrameworksCustom(Enum): + ''' Adjust repositories to prepare custom framework configurations ''' + + @staticmethod def pybuda_matmul_joins(): SKIP_OPERATORS = ( @@ -70,21 +90,7 @@ def pybuda_matmul_joins(): return framework - @staticmethod - def healty_pytorch(): - SKIP_OPERATORS = ( - "sqrt", # skip because it's failing for negative values - # "linear", - "conv2d", # skip until calc_input_shapes is properly implemented - ) - - framework = FrameworkTestUtils.copy_framework(Frameworks.PYTORCH.value, SKIP_OPERATORS) - - return framework - - PYBUDA = healty_pybuda() PYBUDA_MATMUL_JOINS = pybuda_matmul_joins() - PYTORCH = healty_pytorch() @pytest.mark.parametrize("framework", [ @@ -115,32 +121,6 @@ def test_random_graph_algorithm_pybuda(test_index, random_seeds, test_device, ra process_test("Default", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) -@pytest.mark.parametrize("framework", [ - FrameworksHealthy.PYBUDA_MATMUL_JOINS.value, -]) -def test_random_graph_algorithm_pybuda_matmul_joins(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework): - # adjust randomizer_config - randomizer_config = copy(randomizer_config) - # randomizer_config.debug_shapes = True - # randomizer_config.verify_shapes = True - randomizer_config.dim_min = 3 - randomizer_config.dim_max = 4 - randomizer_config.op_size_per_dim_min = 4 - # randomizer_config.op_size_per_dim_min = 16 - randomizer_config.op_size_per_dim_max = 8 - # randomizer_config.op_size_per_dim_max = 64 - # randomizer_config.op_size_per_dim_max = 256 - randomizer_config.microbatch_size_min = 1 - randomizer_config.microbatch_size_max = 8 - randomizer_config.num_of_nodes_min = 10 - randomizer_config.num_of_nodes_max = 15 - randomizer_config.num_fork_joins_max = 10 - - # TODO random_seed instead of random_seeds - random_seed = random_seeds[test_index] - process_test("Matmul Joins", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) - - @pytest.mark.parametrize("framework", [ FrameworksHealthy.PYTORCH.value, ]) @@ -167,3 +147,29 @@ def test_random_graph_algorithm_pytorch(test_index, random_seeds, test_device, r # TODO random_seed instead of random_seeds random_seed = random_seeds[test_index] process_test("Default", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) + + +@pytest.mark.parametrize("framework", [ 
+ FrameworksCustom.PYBUDA_MATMUL_JOINS.value, +]) +def test_random_graph_algorithm_pybuda_matmul_joins(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework): + # adjust randomizer_config + randomizer_config = copy(randomizer_config) + # randomizer_config.debug_shapes = True + # randomizer_config.verify_shapes = True + randomizer_config.dim_min = 3 + randomizer_config.dim_max = 4 + randomizer_config.op_size_per_dim_min = 4 + # randomizer_config.op_size_per_dim_min = 16 + randomizer_config.op_size_per_dim_max = 8 + # randomizer_config.op_size_per_dim_max = 64 + # randomizer_config.op_size_per_dim_max = 256 + randomizer_config.microbatch_size_min = 1 + randomizer_config.microbatch_size_max = 8 + randomizer_config.num_of_nodes_min = 10 + randomizer_config.num_of_nodes_max = 15 + randomizer_config.num_fork_joins_max = 10 + + # TODO random_seed instead of random_seeds + random_seed = random_seeds[test_index] + process_test("Matmul Joins", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) From 5545c866c84dfc6ec375939d64a696c1ea731b3f Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 30 Jul 2024 15:55:30 +0000 Subject: [PATCH 068/116] ShapeCalculationContext Encapsulate shape calculation method parameters as ShapeCalculationContext Refactor only Issue #2755 (cherry picked from commit 1764b870bfea0cc3333411c3a358d3c9ebc662ff) --- pybuda/pybuda/op_repo/__init__.py | 2 ++ pybuda/pybuda/op_repo/datatypes.py | 27 +++++++++++++++++++-- pybuda/pybuda/op_repo/shapes.py | 16 ++++++++----- pybuda/test/random/rgg/__init__.py | 2 ++ pybuda/test/random/rgg/algorithms.py | 7 +++++- pybuda/test/random/rgg/datatypes.py | 35 ++++++++++++++++++++++++---- pybuda/test/random/rgg/utils.py | 5 ++-- 7 files changed, 78 insertions(+), 16 deletions(-) diff --git a/pybuda/pybuda/op_repo/__init__.py b/pybuda/pybuda/op_repo/__init__.py index 8a21de90..b89fd354 100644 --- a/pybuda/pybuda/op_repo/__init__.py +++ b/pybuda/pybuda/op_repo/__init__.py @@ -12,6 +12,7 @@ from .datatypes import TensorShape, OperatorParam, OperatorParamNumber, OperatorDefinition, OperatorRepository +from .datatypes import ShapeCalculationContext from .pybuda_operators import pybuda_operator_repository from .pytorch_operators import pytorch_operator_repository @@ -21,6 +22,7 @@ "OperatorParamNumber", "OperatorDefinition", "OperatorRepository", + "ShapeCalculationContext", "pybuda_operator_repository", "pytorch_operator_repository", ] diff --git a/pybuda/pybuda/op_repo/datatypes.py b/pybuda/pybuda/op_repo/datatypes.py index 4822924e..9d189ba5 100644 --- a/pybuda/pybuda/op_repo/datatypes.py +++ b/pybuda/pybuda/op_repo/datatypes.py @@ -5,7 +5,7 @@ from random import Random -from typing import List, Tuple, Optional, Callable, Type, Union +from typing import List, Dict, Tuple, Optional, Callable, Type, Union from dataclasses import dataclass, field @@ -34,7 +34,7 @@ class OperatorDefinition: forward_code: Optional[Callable[[], str]] = None forward_params: List[OperatorParam] = field(default_factory=list) operands: List[str] = field(default_factory=list) # TODO describe operand and shapes - calc_input_shapes: Optional[Callable[["OperatorDefinition", TensorShape, Random], List[TensorShape]]] = None # calculate input shapes from output shape + calc_input_shapes: Optional[Callable[["ShapeCalculationContext", Random], List[TensorShape]]] = None # calculate input shapes from output shape @property def is_operator(self) -> bool: @@ -45,6 +45,29 @@ def 
is_layer(self) -> bool: return self.instantiate +class ShapeCalculationContext: + + @property + def operator(self) -> OperatorDefinition: + raise NotImplementedError("Operator is not defined") + + @property + def constructor_kwargs(self) -> Dict[str, object]: + raise NotImplementedError("constructor_kwargs is not defined") + + @property + def forward_kwargs(self) -> Dict[str, object]: + raise NotImplementedError("forward_kwargs is not defined") + + @property + def output_shape(self) -> TensorShape: + raise NotImplementedError("output_shape is not defined") + + @property + def rng_shape(self) -> Random: + raise NotImplementedError("rng_shape is not defined") + + class OperatorRepository: def __init__(self, operators: List[OperatorDefinition]): diff --git a/pybuda/pybuda/op_repo/shapes.py b/pybuda/pybuda/op_repo/shapes.py index 775d1dc6..257d4cf9 100644 --- a/pybuda/pybuda/op_repo/shapes.py +++ b/pybuda/pybuda/op_repo/shapes.py @@ -7,16 +7,18 @@ from random import Random from typing import List -from .datatypes import OperatorDefinition from .datatypes import TensorShape +from .datatypes import ShapeCalculationContext -def same_input_shapes(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]: +def same_input_shapes(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + operator, output_shape = calculation_context.operator, calculation_context.output_shape # each input operand has the same shape as the output - return [output_shape for _ in range(operator_definition.input_num)] + return [output_shape for _ in range(operator.input_num)] -def linear_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]: +def linear_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape # linear layer changes the last dimension of the input shape batch_shape = output_shape[:-1] n = output_shape[-1] @@ -26,7 +28,8 @@ def linear_inputs(operator_definition: OperatorDefinition, output_shape: TensorS # FIXME: conv2d in PyTorch not working properly in all cases -def conv2d_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]: +def conv2d_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape shape1 = output_shape[:1] shape2 = output_shape[2:] n = output_shape[1] @@ -35,7 +38,8 @@ def conv2d_inputs(operator_definition: OperatorDefinition, output_shape: TensorS return input_shapes -def matmul_inputs(operator_definition: OperatorDefinition, output_shape: TensorShape, rng_shape: Random) -> List[TensorShape]: +def matmul_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape batch_shape = output_shape[:-2] m = output_shape[-2] n = output_shape[-1] diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py index 160ade4d..ebc41e18 100644 --- a/pybuda/test/random/rgg/__init__.py +++ b/pybuda/test/random/rgg/__init__.py @@ -6,6 +6,7 @@ from .datatypes import TensorShape from .datatypes import RandomizerConstantNode from .datatypes import RandomizerInputNode, RandomizerNode, ExecutionContext, RandomizerParameters, RandomizerGraph, RandomizerConfig +from .datatypes import 
NodeShapeCalculationContext from .datatypes import RandomizerTestContext from .config import get_randomizer_config_default from .utils import StrUtils, GraphUtils @@ -26,6 +27,7 @@ "RandomizerParameters", "RandomizerGraph", "RandomizerConfig", + "NodeShapeCalculationContext", "RandomizerTestContext", "get_randomizer_config_default", "StrUtils", diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index f72ad8ac..07204fcd 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -10,6 +10,7 @@ from pybuda.op_repo import OperatorDefinition from .datatypes import RandomizerGraph, RandomizerTestContext +from .datatypes import NodeShapeCalculationContext from .datatypes import RandomizerInputNode from .datatypes import RandomizerConstantNode from .base import RandomizerNode, GraphBuilder @@ -251,6 +252,9 @@ def build_graph(self, test_context: RandomizerTestContext): constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) + # Context object for shape calculation, node will be set later in the loop + shape_calculation_context = NodeShapeCalculationContext(node=None, test_context=test_context) + # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max num_of_nodes = rng_graph.randint(self.randomizer_config.num_of_nodes_min, self.randomizer_config.num_of_nodes_max) for node_index in range(num_of_nodes, 0, -1): @@ -315,7 +319,8 @@ def build_graph(self, test_context: RandomizerTestContext): # Creating new node node = RandomizerNode(operator=op1, output_shape=output_shape) # Saving input shapes for the new node - node.input_shapes = NodeUtils.calc_input_shapes(node, rng_shape) + shape_calculation_context.node = node + node.input_shapes = NodeUtils.calc_input_shapes(node, shape_calculation_context) # Initializing default constructor parameters based on input and output shapes self._init_default_constructor_params(node) diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index afa2afbb..ef6a9ce6 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -4,19 +4,17 @@ # Generic test model randomizer -from typing import Dict, List, Optional, Final, Tuple +from typing import Dict, List, Optional, Final from dataclasses import dataclass, field import random import torch +from pybuda.op_repo import TensorShape from pybuda.op_repo import OperatorDefinition +from pybuda.op_repo import ShapeCalculationContext from test.conftest import TestDevice -# Defining a type for tensor shape -TensorShape = Tuple[int, ...] 
- - @dataclass class RandomizerInputNode: constant: Final[bool] = field(default=False, init=False) @@ -69,6 +67,33 @@ def node_info(self): return f"{self.node_name} {self.name}" +class NodeShapeCalculationContext(ShapeCalculationContext): + + def __init__(self, node: RandomizerNode, test_context: 'RandomizerTestContext'): + self.node = node + self.test_context = test_context + + @property + def operator(self) -> OperatorDefinition: + return self.node.operator + + @property + def constructor_kwargs(self) -> Dict[str, object]: + return self.node.constructor_kwargs + + @property + def forward_kwargs(self) -> Dict[str, object]: + return self.node.forward_kwargs + + @property + def output_shape(self) -> TensorShape: + return self.node.output_shape + + @property + def rng_shape(self) -> random.Random: + return self.test_context.rng_shape + + @dataclass class ExecutionContext: values: Dict diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index 860a8c60..bb7167e1 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -18,6 +18,7 @@ from .datatypes import TensorShape from .datatypes import RandomizerConfig, RandomizerTestContext, RandomizerNode, RandomizerGraph +from .datatypes import NodeShapeCalculationContext class StrUtils: @@ -209,8 +210,8 @@ def get_open_nodes_with_input_shape(cls, nodes: List[RandomizerNode], input_shap return [node for node in nodes if cls.is_open(node) and cls.has_open_input_with_input_shape(node, input_shape)] @classmethod - def calc_input_shapes(cls, node: RandomizerNode, rng_shape: random.Random) -> List[TensorShape]: - return node.operator.calc_input_shapes(node.operator, node.output_shape, rng_shape) + def calc_input_shapes(cls, node: RandomizerNode, shape_calculation_context: NodeShapeCalculationContext) -> List[TensorShape]: + return node.operator.calc_input_shapes(shape_calculation_context) class DebugUtils: From 0726d08a4619778e51a58af69313ff4332734c29 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Mon, 29 Jul 2024 12:18:47 +0000 Subject: [PATCH 069/116] Move shape calculation Move shape calculations to test code. Implicitly connect the shape calculation with the operator. 
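For illustration, a minimal sketch of what this means for one repository
entry (both lines appear in the pybuda_operators.py hunk below; the exact
test-side attach mechanism is assumed here, not shown):

    # before: the shape calculation is wired into the operator definition
    OperatorDefinition("exp", "pybuda.op.Exp", 1, calc_input_shapes=same_input_shapes),

    # after: the definition is shape-agnostic; the random graph generator
    # (pybuda/test/random/rgg) supplies the matching calc_input_shapes
    OperatorDefinition("exp", "pybuda.op.Exp", 1),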
Refactor only Issue #2755 (cherry picked from commit 4b0e23b2693acc0ebb04d41c6aeebcd4edd7bc8a) --- pybuda/pybuda/op_repo/pybuda_operators.py | 78 +++++++++++----------- pybuda/pybuda/op_repo/pytorch_operators.py | 20 +++--- pybuda/pybuda/op_repo/shapes.py | 67 ------------------- pybuda/test/random/rgg/frameworks.py | 37 ++++++++-- pybuda/test/random/rgg/shapes.py | 70 +++++++++++++++++++ 5 files changed, 149 insertions(+), 123 deletions(-) delete mode 100644 pybuda/pybuda/op_repo/shapes.py create mode 100644 pybuda/test/random/rgg/shapes.py diff --git a/pybuda/pybuda/op_repo/pybuda_operators.py b/pybuda/pybuda/op_repo/pybuda_operators.py index d0e1d3d6..328226f4 100644 --- a/pybuda/pybuda/op_repo/pybuda_operators.py +++ b/pybuda/pybuda/op_repo/pybuda_operators.py @@ -6,62 +6,60 @@ from .datatypes import OperatorDefinition, OperatorRepository from .datatypes import OperatorParamNumber -from .shapes import same_input_shapes -from .shapes import matmul_inputs # TODO describe operand and shapes _OPERATORS = [ # Unary operators - OperatorDefinition("exp", "pybuda.op.Exp", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("reciprocal", "pybuda.op.Reciprocal", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("buffer", "pybuda.op.Buffer", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("sqrt", "pybuda.op.Sqrt", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("relu", "pybuda.op.Relu", 1, calc_input_shapes=same_input_shapes), + OperatorDefinition("exp", "pybuda.op.Exp", 1), + OperatorDefinition("reciprocal", "pybuda.op.Reciprocal", 1), + OperatorDefinition("buffer", "pybuda.op.Buffer", 1), + OperatorDefinition("sqrt", "pybuda.op.Sqrt", 1), + OperatorDefinition("relu", "pybuda.op.Relu", 1), OperatorDefinition("leaky_relu", "pybuda.op.LeakyRelu", 1, forward_params=[ OperatorParamNumber("alpha", float, 0, 100), - ], calc_input_shapes=same_input_shapes), - OperatorDefinition("nop", "pybuda.op.Identity", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("gelu", "pybuda.op.Gelu", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("log", "pybuda.op.Log", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("sigmoid", "pybuda.op.Sigmoid", 1, calc_input_shapes=same_input_shapes), + ]), + OperatorDefinition("nop", "pybuda.op.Identity", 1), + OperatorDefinition("gelu", "pybuda.op.Gelu", 1), + OperatorDefinition("log", "pybuda.op.Log", 1), + OperatorDefinition("sigmoid", "pybuda.op.Sigmoid", 1), OperatorDefinition("clip", "pybuda.op.Clip", 1, forward_params=[ OperatorParamNumber("min", float, 0, 100), OperatorParamNumber("max", float, 0, 100), - ], calc_input_shapes=same_input_shapes), - OperatorDefinition("sine", "pybuda.op.Sine", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("cosine", "pybuda.op.Cosine", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("abs", "pybuda.op.Abs", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("tanh", "pybuda.op.Tanh", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("cumsum", "pybuda.op.CumSum", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("argmax", "pybuda.op.Argmax", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("logical_not", "pybuda.op.LogicalNot", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("dropout", "pybuda.op.Dropout", 1, calc_input_shapes=same_input_shapes), + ]), + OperatorDefinition("sine", "pybuda.op.Sine", 1), + OperatorDefinition("cosine", "pybuda.op.Cosine", 1), + 
OperatorDefinition("abs", "pybuda.op.Abs", 1), + OperatorDefinition("tanh", "pybuda.op.Tanh", 1), + OperatorDefinition("cumsum", "pybuda.op.CumSum", 1), + OperatorDefinition("argmax", "pybuda.op.Argmax", 1), + OperatorDefinition("logical_not", "pybuda.op.LogicalNot", 1), + OperatorDefinition("dropout", "pybuda.op.Dropout", 1), OperatorDefinition("pow", "pybuda.op.Pow", 1, forward_params=[ OperatorParamNumber("exponent", float, 0, 100), - ], calc_input_shapes=same_input_shapes), - OperatorDefinition("tilizer", "pybuda.op.Tilize", 1, calc_input_shapes=same_input_shapes), + ]), + OperatorDefinition("tilizer", "pybuda.op.Tilize", 1), # Binary operators - OperatorDefinition("add", "pybuda.op.Add", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("divide", "pybuda.op.Divide", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("subtract", "pybuda.op.Subtract", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("multiply", "pybuda.op.Multiply", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("maximum", "pybuda.op.Max", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("minimum", "pybuda.op.Min", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("heaviside", "pybuda.op.Heaviside", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("binary_stack", "pybuda.op.BinaryStack", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("power", "pybuda.op.Power", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("greater", "pybuda.op.Greater", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("greater_equal", "pybuda.op.GreaterEqual", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("less", "pybuda.op.Less", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("less_equal", "pybuda.op.LessEqual", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("equal", "pybuda.op.Equal", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("not_equal", "pybuda.op.NotEqual", 2, calc_input_shapes=same_input_shapes), - OperatorDefinition("logical_and", "pybuda.op.LogicalAnd", 2, calc_input_shapes=matmul_inputs), + OperatorDefinition("add", "pybuda.op.Add", 2), + OperatorDefinition("divide", "pybuda.op.Divide", 2), + OperatorDefinition("subtract", "pybuda.op.Subtract", 2), + OperatorDefinition("multiply", "pybuda.op.Multiply", 2), + OperatorDefinition("maximum", "pybuda.op.Max", 2), + OperatorDefinition("minimum", "pybuda.op.Min", 2), + OperatorDefinition("heaviside", "pybuda.op.Heaviside", 2), + OperatorDefinition("binary_stack", "pybuda.op.BinaryStack", 2), + OperatorDefinition("power", "pybuda.op.Power", 2), + OperatorDefinition("greater", "pybuda.op.Greater", 2), + OperatorDefinition("greater_equal", "pybuda.op.GreaterEqual", 2), + OperatorDefinition("less", "pybuda.op.Less", 2), + OperatorDefinition("less_equal", "pybuda.op.LessEqual", 2), + OperatorDefinition("equal", "pybuda.op.Equal", 2), + OperatorDefinition("not_equal", "pybuda.op.NotEqual", 2), + OperatorDefinition("logical_and", "pybuda.op.LogicalAnd", 2), - OperatorDefinition("matmul", "pybuda.op.Matmul", 2, calc_input_shapes=matmul_inputs), + OperatorDefinition("matmul", "pybuda.op.Matmul", 2), ] diff --git a/pybuda/pybuda/op_repo/pytorch_operators.py b/pybuda/pybuda/op_repo/pytorch_operators.py index ef78c878..fd917b85 100644 --- a/pybuda/pybuda/op_repo/pytorch_operators.py +++ b/pybuda/pybuda/op_repo/pytorch_operators.py @@ -6,10 +6,6 @@ from .datatypes import OperatorDefinition, OperatorRepository from 
.datatypes import OperatorParamNumber -from .shapes import same_input_shapes -from .shapes import linear_inputs -from .shapes import conv2d_inputs -from .shapes import matmul_inputs # TODO describe operand and shapes @@ -17,19 +13,19 @@ OperatorDefinition("linear", "torch.nn.Linear", 1, instantiate=True, constructor_params=[ OperatorParamNumber("in_features", int, 10, 50), OperatorParamNumber("out_features", int, 10, 50), - ], calc_input_shapes=linear_inputs), + ]), OperatorDefinition("conv2d", "torch.nn.Conv2d", 1, instantiate=True, constructor_params=[ OperatorParamNumber("in_channels", int, 10, 50), OperatorParamNumber("out_channels", int, 10, 50), OperatorParamNumber("kernel_size", int, 3, 3), OperatorParamNumber("stride", int, 1, 1), OperatorParamNumber("padding", int, 1, 1), - ], calc_input_shapes=conv2d_inputs), - OperatorDefinition("relu", "torch.relu", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("sqrt", "torch.sqrt", 1, calc_input_shapes=same_input_shapes), - OperatorDefinition("tanh", "torch.tanh", 1, calc_input_shapes=same_input_shapes), + ]), + OperatorDefinition("relu", "torch.relu", 1), + OperatorDefinition("sqrt", "torch.sqrt", 1), + OperatorDefinition("tanh", "torch.tanh", 1), # OperatorDefinition("add", "torch.add", 1), - OperatorDefinition("add", "torch.add", 2, calc_input_shapes=same_input_shapes), + OperatorDefinition("add", "torch.add", 2), # Non-linear activation functions # HARDTANH = OperatorDefinition("hardtanh", 1) @@ -65,8 +61,8 @@ # LOCAL_RESPONSE_NORM = OperatorDefinition("local_response_norm", 1) # NORMALIZE = OperatorDefinition("normalize", 1) - OperatorDefinition("matmul", "torch.matmul", 2, calc_input_shapes=matmul_inputs), - OperatorDefinition("eltwise", "torch.add", 2, calc_input_shapes=same_input_shapes), + OperatorDefinition("matmul", "torch.matmul", 2), + OperatorDefinition("eltwise", "torch.add", 2), ] diff --git a/pybuda/pybuda/op_repo/shapes.py b/pybuda/pybuda/op_repo/shapes.py deleted file mode 100644 index 257d4cf9..00000000 --- a/pybuda/pybuda/op_repo/shapes.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# Calculation of input shapes from output shapes for the specified operator - - -from random import Random -from typing import List - -from .datatypes import TensorShape -from .datatypes import ShapeCalculationContext - - -def same_input_shapes(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - operator, output_shape = calculation_context.operator, calculation_context.output_shape - # each input operand has the same shape as the output - return [output_shape for _ in range(operator.input_num)] - - -def linear_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape - # linear layer changes the last dimension of the input shape - batch_shape = output_shape[:-1] - n = output_shape[-1] - n = randomize_size(n, rng_shape) - input_shapes = [batch_shape + (n,)] - return input_shapes - - -# FIXME: conv2d in PyTorch not working properly in all cases -def conv2d_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape - shape1 = output_shape[:1] - shape2 = output_shape[2:] - n = output_shape[1] - n = randomize_size(n, rng_shape) - input_shapes = [shape1 + (n,) + shape2] - return input_shapes - - -def 
matmul_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape - batch_shape = output_shape[:-2] - m = output_shape[-2] - n = output_shape[-1] - # calculates inner dimension based on one of output shape dimensions - q = randomize_size(n, rng_shape) - input_shapes = [batch_shape + (m,q), batch_shape + (q,n)] - return input_shapes - - -def randomize_size(n: int, rng_shape: Random) -> int: - '''Randomize size of an dimension based on size of another dimension. - Returns a random integer in the range [n/2, 3n/2] inclusive to keep the size of the dimension in a similar range. - - Args: - n: size of an dimension - rng_shape: random number generator - - Returns: - int: random size of an dimension - ''' - range = n // 2 - diff = rng_shape.randint(-1 * range, max(range, 1)) - new_value = n + diff - # logger.trace(f"Randomize size: {n} + {diff} -> {new_value}") - return new_value diff --git a/pybuda/test/random/rgg/frameworks.py b/pybuda/test/random/rgg/frameworks.py index c1e1ee9e..d8bbbbcb 100644 --- a/pybuda/test/random/rgg/frameworks.py +++ b/pybuda/test/random/rgg/frameworks.py @@ -6,10 +6,12 @@ from enum import Enum -from typing import Tuple +from loguru import logger +from typing import Tuple, Type from copy import copy -from .base import Framework +from .base import Framework, ModelBuilder +from .shapes import OperatorShapes from .pybuda.model import PyBudaModelBuilder from .pytorch.model import PyTorchModelBuilder @@ -17,6 +19,7 @@ from pybuda.op_repo import pybuda_operator_repository from pybuda.op_repo import pytorch_operator_repository from pybuda.op_repo import OperatorDefinition +from pybuda.op_repo import OperatorRepository class FrameworkTestUtils: @@ -53,16 +56,42 @@ def copy_operator(cls, framework: Framework, operator_name: str) -> OperatorDefi operators[i] = operator return operator + @classmethod + def set_calc_input_shapes(cls, framework: Framework, allow_operators: Tuple[str] = []) -> None: + ''' Implicitly set calc_input_shapes for all operators in the framework ''' + logger.debug(f"Setting calc_input_shapes for framework {framework.framework_name}") + for operator in framework.operator_repository.operators: + function_name = f"{operator.name}_inputs" + if function_name in OperatorShapes.__dict__: + logger.debug(f"Found method {function_name} for {operator.name}") + operator.calc_input_shapes = OperatorShapes.__dict__[function_name] + else: + operator.calc_input_shapes = OperatorShapes.same_input_shapes + class Frameworks(Enum): ''' Register of all frameworks ''' - PYBUDA = Framework( + @staticmethod + def build_framework(framework_name: str, ModelBuilderType: Type[ModelBuilder], operator_repository: OperatorRepository): + framework = Framework( + framework_name=framework_name, + ModelBuilderType=ModelBuilderType, + operator_repository=operator_repository, + ) + + framework = FrameworkTestUtils.copy_framework(framework=framework, skip_operators=()) + + FrameworkTestUtils.set_calc_input_shapes(framework) + + return framework + + PYBUDA = build_framework( framework_name="PyBuda", ModelBuilderType=PyBudaModelBuilder, operator_repository=pybuda_operator_repository, ) - PYTORCH = Framework( + PYTORCH = build_framework( framework_name="PyTorch", ModelBuilderType=PyTorchModelBuilder, operator_repository=pytorch_operator_repository, diff --git a/pybuda/test/random/rgg/shapes.py b/pybuda/test/random/rgg/shapes.py new file mode 100644 index 00000000..88a4c93f --- /dev/null +++ 
b/pybuda/test/random/rgg/shapes.py
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+# Calculation of input shapes from output shapes for the specified operator
+
+
+from random import Random
+from typing import List
+
+from .datatypes import TensorShape
+from .datatypes import ShapeCalculationContext
+
+
+class OperatorShapes:
+
+    @staticmethod
+    def same_input_shapes(calculation_context: ShapeCalculationContext) -> List[TensorShape]:
+        operator, output_shape = calculation_context.operator, calculation_context.output_shape
+        # each input operand has the same shape as the output
+        return [output_shape for _ in range(operator.input_num)]
+
+    @staticmethod
+    def linear_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]:
+        output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape
+        # linear layer changes the last dimension of the input shape
+        batch_shape = output_shape[:-1]
+        n = output_shape[-1]
+        n = randomize_size(n, rng_shape)
+        input_shapes = [batch_shape + (n,)]
+        return input_shapes
+
+    # FIXME: conv2d in PyTorch not working properly in all cases
+    @staticmethod
+    def conv2d_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]:
+        output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape
+        shape1 = output_shape[:1]
+        shape2 = output_shape[2:]
+        n = output_shape[1]
+        n = randomize_size(n, rng_shape)
+        input_shapes = [shape1 + (n,) + shape2]
+        return input_shapes
+
+    @staticmethod
+    def matmul_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]:
+        output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape
+        batch_shape = output_shape[:-2]
+        m = output_shape[-2]
+        n = output_shape[-1]
+        # calculates inner dimension based on one of output shape dimensions
+        q = randomize_size(n, rng_shape)
+        input_shapes = [batch_shape + (m,q), batch_shape + (q,n)]
+        return input_shapes
+
+
+def randomize_size(n: int, rng_shape: Random) -> int:
+    '''Randomize size of a dimension based on size of another dimension.
+    Returns a random integer in the range [n/2, 3n/2] inclusive to keep the size of the dimension in a similar range.
+
+    Args:
+        n: size of a dimension
+        rng_shape: random number generator
+
+    Returns:
+        int: random size of a dimension
+    '''
+    range = n // 2
+    diff = rng_shape.randint(-1 * range, max(range, 1))
+    new_value = n + diff
+    # logger.trace(f"Randomize size: {n} + {diff} -> {new_value}")
+    return new_value

From 69fe1303e0d322c878e34948e61137b20c34f4fd Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Tue, 6 Aug 2024 08:23:45 +0000
Subject: [PATCH 070/116] Fix RateLimitter max limit

(cherry picked from commit 7711dde5ffc1760f0f7889773be821ffd9f07694)
---
 pybuda/test/random/rgg/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index bb7167e1..361231e1 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -240,7 +240,7 @@ def __init__(self, rng: random.Random, max_limit: int, current_limit: int):
 
     def is_allowed(self) -> bool:
         '''Check if the operation is allowed by the rate limit factor and current random value'''
-        self.current_value = self.rng.randint(0, 100)
+        self.current_value = self.rng.randint(0, self.max_limit)
         return self.current_value < self.current_limit
 
     def limit_info(self) -> str:

From 094af8a4cabc1991acb378abea55519a0428803d Mon Sep 17 00:00:00 2001
From: Vladimir Brkic
Date: Wed, 31 Jul 2024 13:12:31 +0000
Subject: [PATCH 071/116] Timeout for long verifications

Issue #2755

(cherry picked from commit 259f7664aff18fd7843431e0c502e0ad744ceb72)
---
 pybuda/test/random/rgg/base.py      | 15 +++++++++++++++
 pybuda/test/random/rgg/config.py    |  1 +
 pybuda/test/random/rgg/datatypes.py |  1 +
 pybuda/test/random/rgg/utils.py     | 29 +++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+)

diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py
index 5cec806a..79bda5a3 100644
--- a/pybuda/test/random/rgg/base.py
+++ b/pybuda/test/random/rgg/base.py
@@ -20,6 +20,7 @@ from .datatypes import RandomizerNode, RandomizerGraph, RandomizerParameters, RandomizerConfig, ExecutionContext
 from .datatypes import RandomizerTestContext
 from .utils import StrUtils, GraphUtils
+from .utils import timeout, TimeoutException
 
 
 class GraphBuilder:
@@ -190,6 +191,20 @@ def build_model(self) -> PyBudaModule:
         return model
 
     def verify(self, model: PyBudaModule) -> None:
+
+        verification_timeout = self.test_context.randomizer_config.verification_timeout
+
+        try:
+            @timeout(verification_timeout)
+            def verify_model_timeout() -> None:
+                self.verify_model(model)
+
+            verify_model_timeout()
+        except TimeoutException as e:
+            logger.error(f"Module verification took too long: {e}")
+            raise e
+
+    def verify_model(self, model: PyBudaModule) -> None:
         """
         Verify the model by building it and performing validation via PyBuda.
         The method is usually implemented once per framework.
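[Editor's note] The timeout machinery this patch introduces (see the utils.py
hunk below) relies on SIGALRM, so it only works on Unix and on the main thread,
and signal.alarm has whole-second resolution. A minimal, self-contained sketch
of the same pattern, with a hypothetical slow_verify standing in for a long
model verification:

    import signal
    import time

    class TimeoutException(Exception):
        pass

    def timeout_handler(signum, frame):
        raise TimeoutException

    def timeout(seconds):
        def decorator(func):
            def wrapper(*args, **kwargs):
                signal.signal(signal.SIGALRM, timeout_handler)  # install handler
                signal.alarm(seconds)                           # arm the alarm
                try:
                    return func(*args, **kwargs)
                finally:
                    signal.alarm(0)                             # always disarm
            return wrapper
        return decorator

    @timeout(1)
    def slow_verify():
        time.sleep(5)  # stands in for a long-running verification

    try:
        slow_verify()
    except TimeoutException:
        print("verification timed out")  # expected after ~1 second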
diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py index 6eec52e8..e741f057 100644 --- a/pybuda/test/random/rgg/config.py +++ b/pybuda/test/random/rgg/config.py @@ -21,6 +21,7 @@ def get_randomizer_config_default(): # build_model_from_code = False, debug_shapes = False, verify_shapes = False, + verification_timeout = int(os.environ.get("VERIFICATION_TIMEOUT", 60)), # TODO ranges # dim_min=int(os.environ.get("MIN_DIM", 3)), dim_min=int(os.environ.get("MIN_DIM", 4)), # Until #2722 is resolved diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index ef6a9ce6..a1aecf31 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -132,6 +132,7 @@ class RandomizerConfig: # build_model_from_code: bool = False # TODO remove obsoleted debug_shapes: bool = False, verify_shapes: bool = False, + verification_timeout: int = 60 dim_min: int = 3 dim_max: int = 4 op_size_per_dim_min: int = 16 diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index 361231e1..d3bcb458 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -5,6 +5,7 @@ import random +import signal from typing import Callable, Generator, List, Dict from dataclasses import asdict from loguru import logger @@ -249,3 +250,31 @@ def limit_info(self) -> str: return f"{self.current_value} < {self.current_limit}" else: return f"{self.current_value} >= {self.current_limit}" + + +class TimeoutException(Exception): + pass + + +# Handler for timeout signal +def timeout_handler(signum, frame): + raise TimeoutException + + +# Decorator for time limiting +def timeout(seconds): + def decorator(func): + def wrapper(*args, **kwargs): + # Set signal handler + signal.signal(signal.SIGALRM, timeout_handler) + # Set alarm + signal.alarm(seconds) + try: + result = func(*args, **kwargs) + finally: + # Shutdown alarm + signal.alarm(0) + return result + return wrapper + return decorator + From 170ed66dfd5a3459e2699a920fa43eb60754b8c0 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 31 Jul 2024 17:09:27 +0000 Subject: [PATCH 072/116] input_num_range support Issue #2755 (cherry picked from commit 47044f83c18ae33bd340a3f87800aa1770425b8d) --- pybuda/pybuda/op_repo/__init__.py | 4 ++++ pybuda/pybuda/op_repo/datatypes.py | 24 +++++++++++++++++++++++- pybuda/test/random/rgg/algorithms.py | 4 ++-- pybuda/test/random/rgg/base.py | 2 +- pybuda/test/random/rgg/datatypes.py | 11 ++++++++++- pybuda/test/random/rgg/shapes.py | 4 ++-- pybuda/test/random/rgg/utils.py | 2 +- 7 files changed, 43 insertions(+), 8 deletions(-) diff --git a/pybuda/pybuda/op_repo/__init__.py b/pybuda/pybuda/op_repo/__init__.py index b89fd354..b82a64f8 100644 --- a/pybuda/pybuda/op_repo/__init__.py +++ b/pybuda/pybuda/op_repo/__init__.py @@ -11,12 +11,16 @@ # - TVM python_codegen.py +from .datatypes import OperandNumInt, OperandNumTuple, OperandNumRange from .datatypes import TensorShape, OperatorParam, OperatorParamNumber, OperatorDefinition, OperatorRepository from .datatypes import ShapeCalculationContext from .pybuda_operators import pybuda_operator_repository from .pytorch_operators import pytorch_operator_repository __ALL__ = [ + "OperandNumInt", + "OperandNumTuple", + "OperandNumRange", "TensorShape", "OperatorParam", "OperatorParamNumber", diff --git a/pybuda/pybuda/op_repo/datatypes.py b/pybuda/pybuda/op_repo/datatypes.py index 9d189ba5..dcd4d129 100644 --- a/pybuda/pybuda/op_repo/datatypes.py +++ 
b/pybuda/pybuda/op_repo/datatypes.py @@ -24,11 +24,21 @@ class OperatorParamNumber: OperatorParam = Union[OperatorParamNumber] +OperandNumInt = int +OperandNumTuple = Tuple[int, int] + + +@dataclass +class OperandNumRange: + operands_min: int + operands_max: int + + @dataclass class OperatorDefinition: name: str full_name: str - input_num: int + input_num_range: Union[OperandNumInt, OperandNumTuple, OperandNumRange] instantiate: bool = False # nn in Torch require instantiation in constructor constructor_params: List[OperatorParam] = field(default_factory=list) forward_code: Optional[Callable[[], str]] = None @@ -36,6 +46,14 @@ class OperatorDefinition: operands: List[str] = field(default_factory=list) # TODO describe operand and shapes calc_input_shapes: Optional[Callable[["ShapeCalculationContext", Random], List[TensorShape]]] = None # calculate input shapes from output shape + def __post_init__(self): + if isinstance(self.input_num_range, OperandNumInt): + self.input_num_range = OperandNumRange(self.input_num_range, self.input_num_range) + elif isinstance(self.input_num_range, Tuple): + self.input_num_range = OperandNumRange(self.input_num_range[0], self.input_num_range[1]) + else: + raise ValueError(f"Invalid input_num_range type {self.input_num_range}") + @property def is_operator(self) -> bool: return not self.instantiate @@ -51,6 +69,10 @@ class ShapeCalculationContext: def operator(self) -> OperatorDefinition: raise NotImplementedError("Operator is not defined") + @property + def input_num(self) -> int: + raise NotImplementedError("input_num is not defined") + @property def constructor_kwargs(self) -> Dict[str, object]: raise NotImplementedError("constructor_kwargs is not defined") diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 07204fcd..32fac147 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -173,9 +173,9 @@ def validate_graph(cls, graph: RandomizerGraph): # Validation of input configuration for node in nodes: - if node.operator.input_num and node.operator.input_num > 1: + if node.input_num and node.input_num > 1: if NodeUtils.num_of_open_inputs(node) > 0: - raise Exception(f"Closed {NodeUtils.num_of_closed_inputs(node)}/{node.operator.input_num} inputs, missing {NodeUtils.num_of_open_inputs(node)} inputs for node {node.node_info}") + raise Exception(f"Closed {NodeUtils.num_of_closed_inputs(node)}/{node.input_num} inputs, missing {NodeUtils.num_of_open_inputs(node)} inputs for node {node.node_info}") # Validation of operator and layer types for node in nodes: diff --git a/pybuda/test/random/rgg/base.py b/pybuda/test/random/rgg/base.py index 79bda5a3..3569c11a 100644 --- a/pybuda/test/random/rgg/base.py +++ b/pybuda/test/random/rgg/base.py @@ -74,7 +74,7 @@ def constructor_kwargs(self, node: RandomizerNode): return StrUtils.kwargs_str(**node.constructor_kwargs) def forward_args(self, node: RandomizerNode) -> str: - args_str = ", ".join([f"inputs[{i}]" for i in range(node.operator.input_num)]) + args_str = ", ".join([f"inputs[{i}]" for i in range(node.input_num)]) return args_str def forward_kwargs(self, node: RandomizerNode) -> str: diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index a1aecf31..2825d181 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -35,6 +35,7 @@ class RandomizerNode: index: Optional[int] = None out_value: Optional[str] = None operator: Optional[OperatorDefinition] = None + input_num: 
int = field(init=False) inputs: List['RandomizerNode'] = field(init=False) constructor_kwargs: Dict[str, object] = field(default_factory=dict) forward_kwargs: Dict[str, object] = field(default_factory=dict) @@ -44,7 +45,11 @@ class RandomizerNode: def __post_init__(self): # List of input nodes is initialized with None values for each input # Inputs will be set later during graph construction - self.inputs = [None for _ in range(self.operator.input_num)] + self.input_num = self.operator.input_num_range.operands_min + self.init_inputs() + + def init_inputs(self): + self.inputs = [None for _ in range(self.input_num)] @property def operator_name(self): @@ -76,6 +81,10 @@ def __init__(self, node: RandomizerNode, test_context: 'RandomizerTestContext'): @property def operator(self) -> OperatorDefinition: return self.node.operator + + @property + def input_num(self) -> int: + return self.node.input_num @property def constructor_kwargs(self) -> Dict[str, object]: diff --git a/pybuda/test/random/rgg/shapes.py b/pybuda/test/random/rgg/shapes.py index 88a4c93f..0ccf6545 100644 --- a/pybuda/test/random/rgg/shapes.py +++ b/pybuda/test/random/rgg/shapes.py @@ -15,9 +15,9 @@ class OperatorShapes: @staticmethod def same_input_shapes(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - operator, output_shape = calculation_context.operator, calculation_context.output_shape + input_num, output_shape = calculation_context.input_num, calculation_context.output_shape # each input operand has the same shape as the output - return [output_shape for _ in range(operator.input_num)] + return [output_shape for _ in range(input_num)] @staticmethod def linear_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index d3bcb458..514caa34 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -180,7 +180,7 @@ def num_of_open_inputs(cls, node: RandomizerNode) -> int: @classmethod def num_of_closed_inputs(cls, node: RandomizerNode) -> int: - return node.operator.input_num - cls.num_of_open_inputs(node) + return node.input_num - cls.num_of_open_inputs(node) @classmethod def is_open(cls, node: RandomizerNode) -> bool: From a54a5e94ba844c00e7e8b0e6b86fef2395f8782e Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 30 Jul 2024 15:53:14 +0000 Subject: [PATCH 073/116] init_node_params before shape calculation Issue #2755 (cherry picked from commit ca8ae0096ec444b919c5dfda0f642e5c860ac2f2) --- pybuda/test/random/rgg/algorithms.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 32fac147..01a1bcdd 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -136,25 +136,21 @@ def init_nodes_inputs(cls, test_context: RandomizerTestContext): constant_node.out_value = f"iconst{iconst_index}" @classmethod - def init_nodes_params(cls, test_context: RandomizerTestContext): + def init_node_params(cls, node: RandomizerNode, test_context: RandomizerTestContext): """ - Generates random parameters for each node. + Generates random parameters for specified node. Args: + node (RandomizerNode): The node. test_context (RandomizerTestContext): The test context. 
Returns: None """ - nodes = test_context.graph.nodes rng_params = test_context.rng_params - logger.trace("Generating random settings for operator parameters") - # Generate random values for operator parameters - for node in nodes: - node.constructor_kwargs = RandomUtils.constructor_kwargs(node.operator, node.constructor_kwargs, rng_params) - node.forward_kwargs = RandomUtils.forward_kwargs(node.operator, node.forward_kwargs, rng_params) - logger.trace("Random settings for operator parameters generated") + node.constructor_kwargs = RandomUtils.constructor_kwargs(node.operator, node.constructor_kwargs, rng_params) + node.forward_kwargs = RandomUtils.forward_kwargs(node.operator, node.forward_kwargs, rng_params) @classmethod def validate_graph(cls, graph: RandomizerGraph): @@ -190,7 +186,6 @@ def prepare_graph(cls, test_context: RandomizerTestContext): logger.trace("Initializing nodes") cls.init_nodes_names(test_context) cls.init_nodes_inputs(test_context) - cls.init_nodes_params(test_context) logger.trace("Nodes initialized") logger.trace("Validating graph") @@ -318,6 +313,11 @@ def build_graph(self, test_context: RandomizerTestContext): # Creating new node node = RandomizerNode(operator=op1, output_shape=output_shape) + + # Initializing node parameters + # Calculating input shapes may require input parameters for its calculation + GraphNodeSetup.init_node_params(node, test_context) + # Saving input shapes for the new node shape_calculation_context.node = node node.input_shapes = NodeUtils.calc_input_shapes(node, shape_calculation_context) From 48f3757b139f009d6bf68b963b4989ba8456763a Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 31 Jul 2024 16:11:12 +0000 Subject: [PATCH 074/116] Initializing random inputs Initializing random inputs based on operand num range Issue #2755 (cherry picked from commit e719526b8b86d41dc800a9a7ff4e8f3d628c3ca9) --- pybuda/test/random/rgg/algorithms.py | 3 +++ pybuda/test/random/rgg/utils.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 01a1bcdd..6cdf1658 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -318,6 +318,9 @@ def build_graph(self, test_context: RandomizerTestContext): # Calculating input shapes may require input parameters for its calculation GraphNodeSetup.init_node_params(node, test_context) + # Initializing random inputs based on operand num range + NodeUtils.init_random_inputs(node, test_context) + # Saving input shapes for the new node shape_calculation_context.node = node node.input_shapes = NodeUtils.calc_input_shapes(node, shape_calculation_context) diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index 514caa34..3ce2195d 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -214,6 +214,16 @@ def get_open_nodes_with_input_shape(cls, nodes: List[RandomizerNode], input_shap def calc_input_shapes(cls, node: RandomizerNode, shape_calculation_context: NodeShapeCalculationContext) -> List[TensorShape]: return node.operator.calc_input_shapes(shape_calculation_context) + @classmethod + def get_random_input_num(cls, node: RandomizerNode, test_context: RandomizerTestContext) -> int: + input_num_range = node.operator.input_num_range + return test_context.rng_graph.randint(input_num_range.operands_min, input_num_range.operands_max) + + @classmethod + def init_random_inputs(cls, node: RandomizerNode, test_context: RandomizerTestContext) -> 
None: + node.input_num = cls.get_random_input_num(node, test_context) + node.init_inputs() + class DebugUtils: From a1503545b4dae155e3b4c702a9bef015d8c9dc3a Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Thu, 1 Aug 2024 10:41:43 +0000 Subject: [PATCH 075/116] Adjust parameters Skip node if shape is invalid Issue #2755 (cherry picked from commit 88805541eba9866cfbccf2771d99eb5042c74400) --- pybuda/test/random/rgg/algorithms.py | 38 +++++++++++++++++++++++++--- pybuda/test/random/rgg/datatypes.py | 6 +++++ pybuda/test/random/rgg/shapes.py | 6 +++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index 6cdf1658..e5c85127 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -13,10 +13,12 @@ from .datatypes import NodeShapeCalculationContext from .datatypes import RandomizerInputNode from .datatypes import RandomizerConstantNode +from .datatypes import InvalidShape from .base import RandomizerNode, GraphBuilder from .base import Framework from .utils import RandomUtils, StrUtils, NodeUtils from .utils import RateLimitter +from .shapes import AdjustParameters class GraphNodeSetup: @@ -223,6 +225,17 @@ def _init_default_constructor_params(cls, node: RandomizerNode): if len([param for param in node.operator.constructor_params if param.name == "out_channels"]) == 1: node.constructor_kwargs["out_channels"] = node.output_shape[1] + @classmethod + def _adjust_params(cls, node: RandomizerNode, test_context: RandomizerTestContext): + + function_name = f"{node.operator.name}_adjust" + if function_name in AdjustParameters.__dict__: + logger.trace(f"Found method {function_name}") + adjust_params_method = AdjustParameters.__dict__[function_name] + adjust_params_method(node, test_context) + else: + pass + # Build graph of random operators via random graph building algorithm # Graph contains between num_of_nodes_min and num_of_nodes_max nodes # Graph is constructed backwards starting from end node @@ -253,11 +266,13 @@ def build_graph(self, test_context: RandomizerTestContext): # Building the graph with number of nodes between num_of_nodes_min and num_of_nodes_max num_of_nodes = rng_graph.randint(self.randomizer_config.num_of_nodes_min, self.randomizer_config.num_of_nodes_max) for node_index in range(num_of_nodes, 0, -1): - first_node = node_index == num_of_nodes + first_node = len(nodes) == 0 # Choose operator randomly based on rng op1 = self._get_random_operator(rng_graph) + node_name = f"op{node_index}[{op1.name}]" + # Find all open nodes open_nodes = NodeUtils.get_open_nodes(nodes) @@ -295,7 +310,7 @@ def build_graph(self, test_context: RandomizerTestContext): # Increase fork join counter new_fork_join = subset_count - 1 if new_fork_join > 0: - logger.trace(f"Constructing {new_fork_join} new fork join(s) from operator op{node_index} {op1.name}") + logger.trace(f"Constructing {new_fork_join} new fork join(s) from operator {node_name}") fork_join_counter += new_fork_join # Select random subset of open nodes to close @@ -303,7 +318,7 @@ def build_graph(self, test_context: RandomizerTestContext): if len(random_nodes) > 1: for random_node in random_nodes[1:]: - logger.trace(f"Constructing new fork join from operator op{node_index} {op1.name} -> {random_node.name}") + logger.trace(f"Constructing new fork join from operator {node_name} -> {random_node.name}") else: random_nodes = [] @@ -321,9 +336,24 @@ def build_graph(self, test_context: RandomizerTestContext): # 
Initializing random inputs based on operand num range NodeUtils.init_random_inputs(node, test_context) + try: + # Try to adjust parameters to avoid invalid shapes + self._adjust_params(node, test_context) + except InvalidShape as e: + # Skip node if shape doesn't support fixing + logger.warning(f"Invalid shape -> Skip node {node_name} because params adjustment failed: {e}") + # TODO repeat node generation with different operator + continue + # Saving input shapes for the new node shape_calculation_context.node = node - node.input_shapes = NodeUtils.calc_input_shapes(node, shape_calculation_context) + try: + node.input_shapes = NodeUtils.calc_input_shapes(node, shape_calculation_context) + except InvalidShape as e: + # Skip node if shape is invalid + logger.warning(f"Invalid shape calculation -> Skip node {node_name}: {e}") + # TODO repeat node generation with different operator + continue # Initializing default constructor parameters based on input and output shapes self._init_default_constructor_params(node) diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index 2825d181..37be75dd 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -170,3 +170,9 @@ class RandomizerTestContext: rng_shape: Optional[random.Random] = None # random number generators for parameters rng_params: Optional[random.Random] = None + + +class InvalidShape(Exception): + + def __init__(self, message): + super().__init__(message) diff --git a/pybuda/test/random/rgg/shapes.py b/pybuda/test/random/rgg/shapes.py index 0ccf6545..09c38e14 100644 --- a/pybuda/test/random/rgg/shapes.py +++ b/pybuda/test/random/rgg/shapes.py @@ -68,3 +68,9 @@ def randomize_size(n: int, rng_shape: Random) -> int: new_value = n + diff # logger.trace(f"Randomize size: {n} + {diff} -> {new_value}") return new_value + + +class AdjustParameters: + '''Adjust parameters for operators based on output shape''' + # TODO Introduce adjustment method in operator definition similar to calc_input_shapes + From e36054bd09c5faf2d137999d1f0d6ac0fe951489 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 24 Jul 2024 13:26:48 +0000 Subject: [PATCH 076/116] Quantize shape Issue #2755 (cherry picked from commit e6e05249b35221e0c65c351d21ed41602a844aff) --- pybuda/test/README.debug.md | 5 ++-- pybuda/test/random/rgg/config.py | 1 + pybuda/test/random/rgg/datatypes.py | 1 + pybuda/test/random/rgg/shapes.py | 46 ++++++++++++++++++----------- pybuda/test/random/rgg/utils.py | 19 +++++++++++- 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md index efe83e80..f02b36c0 100644 --- a/pybuda/test/README.debug.md +++ b/pybuda/test/README.debug.md @@ -6,8 +6,9 @@ * RANDOM\_TESTS\_SELECTED: Limiting random tests to only selected subset defined as comma separated list of test indexes. E.x. "3,4,6". Default is no limitation if not specified or empty. * MIN\_DIM: Minimal number of dimensions of input tensors. (default: 3) * MAX\_DIM: Maximum number of dimensions of input tensors. (default: 4) - * MIN\_OP\_SIZE\_PER\_DIM: Minimal size of an operator dimension. (default: 16) - * MAX\_OP\_SIZE\_PER\_DIM: Maximum size of an operator dimension. Smaller operator size results in fewer failed tests. (default: 512) + * MIN\_OP\_SIZE\_PER\_DIM: Minimal size of an operand dimension. (default: 16) + * MAX\_OP\_SIZE\_PER\_DIM: Maximum size of an operand dimension. Smaller operand size results in fewer failed tests. 
(default: 512) + * OP\_SIZE\_QUANTIZATION: Quantization factor for operand size. (default: 1) * MIN_MICROBATCH_SIZE: Minimal size of microbatch of an input tensor. (default: 1) * MAX_MICROBATCH_SIZE: Maximum size of microbatch of an input tensor. (default: 8) * NUM\_OF\_NODES\_MIN: Minimal number of nodes to be generated by RGG. (default: 5) diff --git a/pybuda/test/random/rgg/config.py b/pybuda/test/random/rgg/config.py index e741f057..21dc8efb 100644 --- a/pybuda/test/random/rgg/config.py +++ b/pybuda/test/random/rgg/config.py @@ -29,6 +29,7 @@ def get_randomizer_config_default(): op_size_per_dim_min=int(os.environ.get("MIN_OP_SIZE_PER_DIM", 16)), op_size_per_dim_max=int(os.environ.get("MAX_OP_SIZE_PER_DIM", 64)), # by default run with smaller sizes # op_size_per_dim_max=int(os.environ.get("MAX_OP_SIZE_PER_DIM", 512)), + op_size_quantization=int(os.environ.get("OP_SIZE_QUANTIZATION", 1)), microbatch_size_min=int(os.environ.get("MIN_MICROBATCH_SIZE", 1)), microbatch_size_max=int(os.environ.get("MAX_MICROBATCH_SIZE", 8)), num_of_nodes_min=int(os.environ.get("NUM_OF_NODES_MIN", 5)), diff --git a/pybuda/test/random/rgg/datatypes.py b/pybuda/test/random/rgg/datatypes.py index 37be75dd..735484f7 100644 --- a/pybuda/test/random/rgg/datatypes.py +++ b/pybuda/test/random/rgg/datatypes.py @@ -146,6 +146,7 @@ class RandomizerConfig: dim_max: int = 4 op_size_per_dim_min: int = 16 op_size_per_dim_max: int = 512 + op_size_quantization: int = 1 microbatch_size_min: int = 1 microbatch_size_max: int = 8 num_of_nodes_min: int = 5 diff --git a/pybuda/test/random/rgg/shapes.py b/pybuda/test/random/rgg/shapes.py index 09c38e14..68e51c4d 100644 --- a/pybuda/test/random/rgg/shapes.py +++ b/pybuda/test/random/rgg/shapes.py @@ -4,12 +4,15 @@ # Calculation of input shapes from output shapes for the specified operator -from random import Random from typing import List from .datatypes import TensorShape from .datatypes import ShapeCalculationContext +from .datatypes import RandomizerTestContext + +from .utils import RandomUtils + class OperatorShapes: @@ -21,53 +24,62 @@ def same_input_shapes(calculation_context: ShapeCalculationContext) -> List[Tens @staticmethod def linear_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape + output_shape = calculation_context.output_shape + test_context: RandomizerTestContext = calculation_context.test_context # linear layer changes the last dimension of the input shape batch_shape = output_shape[:-1] n = output_shape[-1] - n = randomize_size(n, rng_shape) + n = randomize_size(len(batch_shape), test_context) input_shapes = [batch_shape + (n,)] return input_shapes # FIXME: conv2d in PyTorch not working properly in all cases @staticmethod def conv2d_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape + output_shape = calculation_context.output_shape + test_context: RandomizerTestContext = calculation_context.test_context shape1 = output_shape[:1] shape2 = output_shape[2:] n = output_shape[1] - n = randomize_size(n, rng_shape) + n = randomize_size(len(shape1), test_context) input_shapes = [shape1 + (n,) + shape2] return input_shapes @staticmethod def matmul_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: - output_shape, rng_shape = calculation_context.output_shape, calculation_context.rng_shape + output_shape = 
calculation_context.output_shape
+        test_context: RandomizerTestContext = calculation_context.test_context
         batch_shape = output_shape[:-2]
         m = output_shape[-2]
         n = output_shape[-1]
         # calculates inner dimension based on one of output shape dimensions
-        q = randomize_size(n, rng_shape)
+        # dim is wrong for the second operand
+        q = randomize_size(len(batch_shape) + 1, test_context)
         input_shapes = [batch_shape + (m,q), batch_shape + (q,n)]
         return input_shapes
 
 
-def randomize_size(n: int, rng_shape: Random) -> int:
-    '''Randomize size of a dimension based on size of another dimension.
-    Returns a random integer in the range [n/2, 3n/2] inclusive to keep the size of the dimension in a similar range.
+def randomize_size(dim: int, test_context: RandomizerTestContext) -> int:
+    '''Randomize size of a new dimension based on operand size range
 
     Args:
-        n: size of a dimension
-        rng_shape: random number generator
+        dim (int): new dimension
+        test_context: RandomizerTestContext
 
     Returns:
         int: random size of a dimension
     '''
-    range = n // 2
-    diff = rng_shape.randint(-1 * range, max(range, 1))
-    new_value = n + diff
-    # logger.trace(f"Randomize size: {n} + {diff} -> {new_value}")
-    return new_value
+    rng_shape = test_context.rng_shape
+    randomizer_config = test_context.randomizer_config
+    op_size_min = randomizer_config.op_size_per_dim_min
+    op_size_max = randomizer_config.op_size_per_dim_max
+    quantization = randomizer_config.op_size_quantization
+
+    n = rng_shape.randint(op_size_min, op_size_max)
+    n = RandomUtils.quantize(n, quantization)
+    # logger.trace(f"Randomize size: dim = {dim}, quant = {quantization} -> {n}")
+
+    return n
 
 
 class AdjustParameters:
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index 3ce2195d..5e4d22b0 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -102,6 +102,20 @@ def constructor_kwargs(cls, operator: OperatorDefinition, constructor_kwargs: Di
     def forward_kwargs(cls, operator: OperatorDefinition, forward_kwargs: Dict[str, object], rng_params: random.Random) -> Dict:
         return {param.name: cls.random_value_for_param(param, rng_params) if param.name not in forward_kwargs else forward_kwargs[param.name] for param in operator.forward_params}
 
+    @classmethod
+    def quantize(cls, value: int, quantization: int = 2) -> int:
+        '''Quantize the value to the nearest multiple of quantization
+
+        Args:
+            value (int): value to quantize
+            quantization (int, optional): quantization factor. Defaults to 2.
+ + Returns: + int: quantized value + ''' + # Using max to avoid quantizing to 0 + return max(round(value / quantization) * quantization, quantization) + @classmethod def random_shape(cls, rng_shape: random.Random, @@ -109,10 +123,11 @@ def random_shape(cls, dim_max: int, op_size_min: int, op_size_max: int, + quantization: int, microbatch_size_min: int, microbatch_size_max: int, ) -> TensorShape: - shape = [rng_shape.randint(op_size_min, op_size_max) for _ in range(rng_shape.randint(dim_min - 1, dim_max - 1))] + shape = [cls.quantize(rng_shape.randint(op_size_min, op_size_max), quantization) for _ in range(rng_shape.randint(dim_min - 1, dim_max - 1))] microbatch_size = rng_shape.randint(microbatch_size_min, microbatch_size_max) shape.insert(0, microbatch_size) shape = tuple(shape) @@ -123,6 +138,7 @@ def random_shape(cls, def random_shape_from_config(cls, randomizer_config: RandomizerConfig, rng_shape: random.Random) -> TensorShape: op_size_min = randomizer_config.op_size_per_dim_min op_size_max = randomizer_config.op_size_per_dim_max + op_size_quantization = randomizer_config.op_size_quantization dim_min = randomizer_config.dim_min dim_max = randomizer_config.dim_max @@ -136,6 +152,7 @@ def random_shape_from_config(cls, randomizer_config: RandomizerConfig, rng_shape dim_max=dim_max, op_size_min=op_size_min, op_size_max=op_size_max, + quantization=op_size_quantization, microbatch_size_min=microbatch_size_min, microbatch_size_max=microbatch_size_max, ) From 040d88a16eddc217fae0b0d697d2dc8ab9b7177e Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 31 Jul 2024 21:36:39 +0000 Subject: [PATCH 077/116] Nary operators in RGG New PyBuda operators: stack, concatenate, interleave and where Issue #2755 (cherry picked from commit 1fe92b23a1449120ec45a7076fb6ef8d72c07c74) --- pybuda/pybuda/op_repo/pybuda_operators.py | 15 ++ pybuda/test/random/rgg/shapes.py | 199 +++++++++++++++++++++- pybuda/test/random/test_graphs.py | 55 ++++++ 3 files changed, 267 insertions(+), 2 deletions(-) diff --git a/pybuda/pybuda/op_repo/pybuda_operators.py b/pybuda/pybuda/op_repo/pybuda_operators.py index 328226f4..835801b1 100644 --- a/pybuda/pybuda/op_repo/pybuda_operators.py +++ b/pybuda/pybuda/op_repo/pybuda_operators.py @@ -59,7 +59,22 @@ OperatorDefinition("not_equal", "pybuda.op.NotEqual", 2), OperatorDefinition("logical_and", "pybuda.op.LogicalAnd", 2), + # Nary operators + OperatorDefinition("where", "pybuda.op.Where", 3), + # OperatorDefinition("index_copy", "pybuda.op.IndexCopy", 3), # Bug #2705 + OperatorDefinition("interleave", "pybuda.op.Interleave", (1,10), forward_params=[ + OperatorParamNumber("axis", int, -3, -3), + OperatorParamNumber("stride", int, 1, 1), + ]), + OperatorDefinition("concatenate", "pybuda.op.Concatenate", (1, 10), forward_params=[ + OperatorParamNumber("axis", int, -10, 10), + ]), + OperatorDefinition("stack", "pybuda.op.Stack", (2,4), forward_params=[ + OperatorParamNumber("axis", int, 1, 10), + ]), + OperatorDefinition("matmul", "pybuda.op.Matmul", 2), + # OperatorDefinition("sparse_matmul", "pybuda.op.SparseMatmul", 2), ] diff --git a/pybuda/test/random/rgg/shapes.py b/pybuda/test/random/rgg/shapes.py index 68e51c4d..92c34838 100644 --- a/pybuda/test/random/rgg/shapes.py +++ b/pybuda/test/random/rgg/shapes.py @@ -4,12 +4,16 @@ # Calculation of input shapes from output shapes for the specified operator +import random + +from loguru import logger from typing import List from .datatypes import TensorShape -from .datatypes import ShapeCalculationContext - +from .datatypes import 
RandomizerNode +from .datatypes import InvalidShape from .datatypes import RandomizerTestContext +from .datatypes import ShapeCalculationContext from .utils import RandomUtils @@ -58,6 +62,94 @@ def matmul_inputs(calculation_context: ShapeCalculationContext) -> List[TensorSh input_shapes = [batch_shape + (m,q), batch_shape + (q,n)] return input_shapes + @staticmethod + def interleave_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + # Interleave joins the input shapes along the specified axis + # It requires that axis dimension is divisible by the number of inputs + input_num, output_shape = calculation_context.input_num, calculation_context.output_shape + forward_kwargs = calculation_context.forward_kwargs + axis = forward_kwargs["axis"] + + if axis >= len(output_shape) or axis < 0: + axis %= len(output_shape) + + logger.trace(f"Interleave axis = {axis} output_shape = {output_shape}") + + shape1 = output_shape[:axis] + mid_size = output_shape[axis] + shape2 = output_shape[axis+1:] + + if mid_size < input_num: + raise InvalidShape(f"Output shape {output_shape} is too small mid_size={mid_size} < input_num={input_num}") + + if mid_size % input_num != 0: + raise InvalidShape(f"Output shape {output_shape} axis[{axis}]={mid_size} is not divisible by input_num={input_num}") + + dim = mid_size // input_num + + input_shapes = [shape1 + (dim,) + shape2 for _ in range(input_num)] + return input_shapes + + @staticmethod + def concatenate_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + # Concatenate joins the input shapes along the specified axis + # It requires that axis dimension can be split into input_num parts + input_num, output_shape = calculation_context.input_num, calculation_context.output_shape + test_context: RandomizerTestContext = calculation_context.test_context + rng_shape = test_context.rng_shape + forward_kwargs = calculation_context.forward_kwargs + axis = forward_kwargs["axis"] + + if axis >= len(output_shape) or axis < 0: + axis %= len(output_shape) + + logger.trace(f"Concatenate axis = {axis} output_shape = {output_shape}") + + shape1 = output_shape[:axis] + mid_size = output_shape[axis] + shape2 = output_shape[axis+1:] + + if mid_size < input_num: + raise InvalidShape(f"Output shape {output_shape} is too small mid_size={mid_size} < input_num={input_num}") + + dims = [] + for input_pos in range(input_num): + reserved_size = input_num - input_pos - 1 + mid_range = mid_size - reserved_size + logger.trace(f"input_num = {input_num} mid_size = {mid_size} reserved_size = {reserved_size} mid_range = {mid_range}") + if mid_range <= 0: + raise InvalidShape(f"Output shape {output_shape} is too small mid_range={mid_range} <= 0") + if reserved_size == 0: + dim = mid_size + else: + # TODO quantize size + dim = rng_shape.randint(1, mid_range) + logger.trace(f"dim = {dim}") + mid_size -= dim + dims.append(dim) + + input_shapes = [shape1 + (dim,) + shape2 for dim in dims] + return input_shapes + + @staticmethod + def stack_inputs(calculation_context: ShapeCalculationContext) -> List[TensorShape]: + # Stack adds a new dimension at the specified axis + input_num, output_shape = calculation_context.input_num, calculation_context.output_shape + test_context: RandomizerTestContext = calculation_context.test_context + forward_kwargs = calculation_context.forward_kwargs + axis = forward_kwargs["axis"] + + if len(output_shape) <= test_context.randomizer_config.dim_min: + raise InvalidShape(f"Output shape {output_shape} is too small 
len(output_shape)={len(output_shape)} <= dim_min={test_context.randomizer_config.dim_min}") + dim = output_shape[axis] + if dim != input_num: + raise InvalidShape(f"Mismatch of dim and input_num in output shape {output_shape}. dim={dim} != input_num={input_num}") + shape1 = output_shape[:axis] + shape2 = output_shape[axis+1:] + + input_shapes = [shape1 + shape2 for _ in range(input_num)] + return input_shapes + def randomize_size(dim: int, test_context: RandomizerTestContext) -> int: '''Randomize size of a new dimension based operand size range @@ -86,3 +178,106 @@ class AdjustParameters: '''Adjust parameters for operators based on output shape''' # TODO Introduce adjustment method in operator definition similar to calc_input_shapes + @staticmethod + def interleave_adjust(node: RandomizerNode, test_context: RandomizerTestContext) -> None: + '''Adjust parameters and input number for interleave based on output shape''' + rng_shape = test_context.rng_shape + input_num_range = node.operator.input_num_range + + input_num = node.input_num + output_shape = node.output_shape + axis = node.forward_kwargs["axis"] + + if len(output_shape) < 4: + raise InvalidShape(f"Output shape {node.output_shape} has len(output_shape)={len(output_shape)} < 4") + + if axis != -3: + raise InvalidShape(f"Invalid axis={axis} for output shape {node.output_shape}") + + mid_size = output_shape[axis] + + logger.trace(f"Interleave axis = {axis} output_shape = {output_shape} mid_size = {mid_size} input_num = {input_num}") + + if mid_size % input_num == 0: + # If axis is divisible by input number, no need to recalculate + return + + # Currently axis is required to be -3 so no need to change axis + supported_axises = [(axis, node.output_shape[axis])] + # supported_axises = list(enumerate(node.output_shape)) + + for axis, mid_size in rng_shape.sample(supported_axises, len(supported_axises)): + for input_num in rng_shape.sample(range(input_num_range.operands_min, input_num_range.operands_max+1), input_num_range.operands_max - input_num_range.operands_min + 1): + if mid_size % input_num == 0: + node.forward_kwargs["axis"] = axis + node.input_num = input_num + node.init_inputs() + return + + raise InvalidShape(f"Not found possible params for output shape {node.output_shape}") + + @staticmethod + def concatenate_adjust(node: RandomizerNode, test_context: RandomizerTestContext) -> None: + '''Adjust parameters and input number for concatenate based on output shape''' + rng_shape = test_context.rng_shape + input_num_range = node.operator.input_num_range + + input_num = node.input_num + output_shape = node.output_shape + axis = node.forward_kwargs["axis"] + + if not -len(output_shape) <= axis < len(output_shape): + axis = None # must be recalculated + + if axis is not None and axis % len(output_shape) == 0: + # Axis 0 is not supported + axis = None # must be recalculated + + if axis is not None: + # Maybe it's possible axis + axis %= len(output_shape) + + mid_size = output_shape[axis] + + if mid_size >= input_num: + # It is possible axis, no need to recalculate + return + + # TODO global limit for number of operands + if input_num_range.operands_min <= mid_size <= input_num_range.operands_max: + # Axis is possible but number of inputs is too big + # Lower number of inputs to fit axis dimension + node.input_num = rng_shape.randint(input_num_range.operands_min, mid_size) + node.init_inputs() + return + + # Try another axis + for axis, mid_size in rng_shape.sample(list(enumerate(node.output_shape)), len(node.output_shape)): + if axis % 
len(output_shape) == 0: + # Axis 0 is not supported + continue + if input_num_range.operands_min <= mid_size: + node.forward_kwargs["axis"] = axis + node.input_num = rng_shape.randint(input_num_range.operands_min, min(mid_size, input_num_range.operands_max)) + node.init_inputs() + return + + raise InvalidShape(f"Not found possible params for output shape {node.output_shape}") + + @staticmethod + def stack_adjust(node: RandomizerNode, test_context: RandomizerTestContext) -> None: + '''Adjust parameters and input number for stack based on output shape''' + input_num_range = node.operator.input_num_range + output_shape = node.output_shape + if len(output_shape) <= test_context.randomizer_config.dim_min: + raise InvalidShape(f"Output shape {output_shape} is too small len(output_shape)={len(output_shape)} <= dim_min={test_context.randomizer_config.dim_min}") + for axis, dim in enumerate(node.output_shape): + if axis == 0: + # Axis 0 is not supported + continue + if input_num_range.operands_min <= dim <= input_num_range.operands_max: + node.forward_kwargs["axis"] = axis + node.input_num = dim + node.init_inputs() + return + raise InvalidShape(f"Not found possible params for output shape {node.output_shape}") diff --git a/pybuda/test/random/test_graphs.py b/pybuda/test/random/test_graphs.py index 716b87f5..53012d8a 100644 --- a/pybuda/test/random/test_graphs.py +++ b/pybuda/test/random/test_graphs.py @@ -37,6 +37,9 @@ def healty_pybuda(): "binary_stack", # bug "power", # occasionally fails "logical_and", # bug + + # Nary operators + "where", # pcc? ) framework = FrameworkTestUtils.copy_framework(Frameworks.PYBUDA.value, SKIP_OPERATORS) @@ -90,7 +93,30 @@ def pybuda_matmul_joins(): return framework + @staticmethod + def pybuda_nary(): + SKIP_OPERATORS = ( + ) + + framework = FrameworkTestUtils.copy_framework(Frameworks.PYBUDA.value, SKIP_OPERATORS) + + ALLOW_OPERATORS = ( + # "relu", + "tanh", + "add", + "matmul", # Skip matmul to increase chance for stack operator + "interleave", + # "where", # pcc? + "concatenate", + "stack", + ) + + FrameworkTestUtils.allow_operators(framework, ALLOW_OPERATORS) + + return framework + PYBUDA_MATMUL_JOINS = pybuda_matmul_joins() + PYBUDA_NARY = pybuda_nary() @pytest.mark.parametrize("framework", [ @@ -173,3 +199,32 @@ def test_random_graph_algorithm_pybuda_matmul_joins(test_index, random_seeds, te # TODO random_seed instead of random_seeds random_seed = random_seeds[test_index] process_test("Matmul Joins", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) + + +# @pytest.mark.xfail(reason="Nary operators are buggy") +@pytest.mark.parametrize("framework", [ + FrameworksCustom.PYBUDA_NARY.value, +]) +def test_random_graph_algorithm_pybuda_nary(test_index, random_seeds, test_device, randomizer_config: RandomizerConfig, framework): + # adjust randomizer_config + randomizer_config = copy(randomizer_config) + # randomizer_config.debug_shapes = True + # randomizer_config.verify_shapes = True + randomizer_config.dim_min = 3 + randomizer_config.dim_max = 4 + randomizer_config.op_size_per_dim_min = 2 # avoid failing tests with smaller dimensions? 
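+    # dim_min=3 / dim_max=4 keep stack and interleave reachable: stack_adjust
+    # rejects outputs with len(output_shape) <= dim_min, and interleave
+    # currently requires rank-4 outputs with axis == -3. Even op sizes give
+    # concatenate and interleave a divisible axis to split across operands.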
+ # randomizer_config.op_size_per_dim_min = 4 + # randomizer_config.op_size_per_dim_min = 16 + randomizer_config.op_size_per_dim_max = 8 + # randomizer_config.op_size_per_dim_max = 64 + # randomizer_config.op_size_per_dim_max = 256 + randomizer_config.op_size_quantization = 2 + randomizer_config.microbatch_size_min = 1 + randomizer_config.microbatch_size_max = 8 + randomizer_config.num_of_nodes_min = 10 + randomizer_config.num_of_nodes_max = 15 + randomizer_config.num_fork_joins_max = 10 + + # TODO random_seed instead of random_seeds + random_seed = random_seeds[test_index] + process_test("Nary", test_index, random_seed, test_device, randomizer_config, graph_builder_type=RandomGraphAlgorithm, framework=framework) From 29ce41ef87d8898626cb4a57accd0b9b890d3a3b Mon Sep 17 00:00:00 2001 From: Guangyu Feng Date: Thu, 1 Aug 2024 15:19:23 +0000 Subject: [PATCH 078/116] Code cleanup for data parallel - Fix default envvar to be string type - Rename the boolean variable to `is_data_parallel` - Use loguru for debug prints (cherry picked from commit 03a341e3508388f0da040ea121c37637602e43b9) --- pybuda/pybuda/compiled_graph_state.py | 5 +++-- pybuda/pybuda/device_connector.py | 8 ++++---- pybuda/pybuda/run/impl.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pybuda/pybuda/compiled_graph_state.py b/pybuda/pybuda/compiled_graph_state.py index 1d276b88..c00e3962 100644 --- a/pybuda/pybuda/compiled_graph_state.py +++ b/pybuda/pybuda/compiled_graph_state.py @@ -5,6 +5,7 @@ from typing import Dict, List, Any, Tuple, Optional from dataclasses import dataclass, field from enum import Enum +from loguru import logger import inspect import os import json @@ -215,8 +216,8 @@ def replicate_items(items_to_replicate): ordered_target_tile_broadcast_dims = ordered_target_tile_broadcast_dims + ordered_target_tile_broadcast_dims ordered_bw_input_tile_broadcast_dims = ordered_bw_input_tile_broadcast_dims + ordered_bw_input_tile_broadcast_dims - print(f"ordered_output_names = {ordered_output_names}") - print(f"ordered_output_shapes = {ordered_output_shapes}") #TODO: probably double here + logger.debug("ordered_output_names = {}", ordered_output_names) + logger.debug("ordered_output_shapes = {}", ordered_output_shapes) return CompiledGraphState( microbatch=graph.get_microbatch(), diff --git a/pybuda/pybuda/device_connector.py b/pybuda/pybuda/device_connector.py index 63650223..4d00397e 100644 --- a/pybuda/pybuda/device_connector.py +++ b/pybuda/pybuda/device_connector.py @@ -232,11 +232,11 @@ def _internal_push(self, tensors: List[Tensor]): print(f"Direct push queues have not been set for {self}") assert self.direct_push_queues, "Direct push queues have not been set" assert self.tile_broadcast_dims is not None - data_parallel = os.getenv("PYBUDA_N300_DATA_PARALLEL", 0) - assert len(tensors) == len(self.direct_push_queues) or data_parallel and len(tensors) * 2 == len(self.direct_push_queues), ( + is_data_parallel = int(os.getenv("PYBUDA_N300_DATA_PARALLEL", "0")) + assert len(tensors) == len(self.direct_push_queues) or is_data_parallel and len(tensors) * 2 == len(self.direct_push_queues), ( f"Incorrect number of tensors provided on input: {len(tensors)} vs {len(self.direct_push_queues)}") assert self.runtime_tensor_transforms, "Runtime tensor transforms have not been set" - assert len(tensors) == len(self.runtime_tensor_transforms) or data_parallel and len(tensors) * 2 == len(self.runtime_tensor_transforms) + assert len(tensors) == len(self.runtime_tensor_transforms) or is_data_parallel and 
len(tensors) * 2 == len(self.runtime_tensor_transforms) self.push_to_side_queue(tensors) @@ -250,7 +250,7 @@ def _internal_push(self, tensors: List[Tensor]): else: tensors[i] = t - if data_parallel: + if is_data_parallel: new_tensors = [] new_tensor_dtypes = [] for i, t in enumerate(tensors): diff --git a/pybuda/pybuda/run/impl.py b/pybuda/pybuda/run/impl.py index 7b915fbd..0c1e317d 100644 --- a/pybuda/pybuda/run/impl.py +++ b/pybuda/pybuda/run/impl.py @@ -1233,8 +1233,8 @@ def _compile_devices( devices = get_devices() microbatch_size, inputs = _get_device_zero_inputs(sample_inputs) - data_parallel = os.getenv("PYBUDA_N300_DATA_PARALLEL", 0) - if data_parallel: + is_data_parallel = int(os.getenv("PYBUDA_N300_DATA_PARALLEL", "0")) + if is_data_parallel: assert microbatch_size > 1, "microbatch size is expected to be >= 1 for data parallel" microbatch_size = int(microbatch_size / 2) From 76d0e993c1e91810edb4118e38646e2221118741 Mon Sep 17 00:00:00 2001 From: Vladica Obojevic Date: Fri, 9 Aug 2024 12:39:52 +0000 Subject: [PATCH 079/116] Add optional parameters to pytorch binary operators test (cherry picked from commit babb41c89833b3ec9336a0c279b8988fc9e39c5f) --- .../eltwise_binary/test_pytorch_binary.py | 33 ++++++++++++++----- pybuda/test/random/rgg/__init__.py | 2 ++ pybuda/test/random/rgg/frameworks.py | 24 +++++++------- pybuda/test/random/rgg/utils.py | 1 + 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py index 206f56f8..24dc2d4e 100644 --- a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py +++ b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py @@ -12,6 +12,7 @@ from typing import List, Dict, Type from loguru import logger +import random import torch import pybuda import pybuda.op @@ -20,6 +21,7 @@ from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils from test.conftest import TestDevice +from test.random.rgg import RateLimitter class ModelFromAnotherOp(torch.nn.Module): @@ -200,25 +202,25 @@ def get_eltwise_binary_ops(): return [ "add", #00 "div", #01 - "divide", #02 + "divide", #02 - Alias for div. "mul", #03 - "multiply", #04 + "multiply", #04 - Alias for mul. "sub", #05 - "subtract", #06 - "true_divide", #07 + "subtract", #06 - Alias for sub. + "true_divide", #07 - Alias for div with rounding_mode=None. "eq", #08 "ne", #09 "le", #10 "ge", #11 - "greater", #12 - "greater_equal", #13 + "greater", #12 - Alias for gt. + "greater_equal", #13 - Alias for ge. "gt", #14 - "less_equal", #15 + "less_equal", #15 - Alias for le. "lt", #16 - "less", #17 + "less", #17 - Alias for lt. "maximum", #18 "minimum", #19 - "not_equal", #20 + "not_equal", #20 - Alias for ne. 
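+        # The aliases above resolve to the same underlying torch ops, so both
+        # spellings are exercised against identical kernels.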
    ]


def get_input_shapes():
@@ -353,12 +355,21 @@ def test_pytorch_eltwise_binary_ops_per_test_plan(
     if model_type == ModelFromDramQueue:
         input_source_flag = InputSourceFlags.FROM_DRAM
 
+    kwargs = {}
+    if input_operator in ["add", "sub", "subtract"] and kwargs_limiter.is_allowed():
+        kwargs['alpha'] = random.uniform(0.5, 1000)
+    elif input_operator in ["div", "divide"]:
+        rounding_modes = ['trunc', 'floor', None]
+        kwargs['rounding_mode'] = rounding_modes[random.randint(0, 2)]
+
+
     verify(
         test_device=test_device,
         model_type=model_type,
         input_operator=input_operator,
         input_shape=input_shape,
         number_of_operands=2,
+        kwargs=kwargs,
         input_source_flag=input_source_flag,
         dev_data_format=dev_data_format,
         math_fidelity=input_math_fidelity,
@@ -381,6 +392,10 @@ def test_pytorch_eltwise_binary_ops_per_test_plan(
         assert input_operator not in key
 
 
+rng_limiter = random.Random(0)
+kwargs_limiter = RateLimitter(rng_limiter, 100, 50)
+
+
 def get_not_implemented_pytorch_binary_ops():
     return [
         "atan2",            #00 - NotImplementedError: The following operators are not implemented: ['aten::atan2']
diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py
index ebc41e18..3d3177ef 100644
--- a/pybuda/test/random/rgg/__init__.py
+++ b/pybuda/test/random/rgg/__init__.py
@@ -11,6 +11,7 @@
 from .config import get_randomizer_config_default
 from .utils import StrUtils, GraphUtils
 from .utils import DebugUtils
+from .utils import RateLimitter
 from .base import Framework, GraphBuilder, ModelBuilder
 from .base import RandomizerRunner, RandomizerCodeGenerator, process_test
 from .frameworks import Frameworks
@@ -33,6 +34,7 @@
     "StrUtils",
     "GraphUtils",
     "DebugUtils",
+    "RateLimitter",
     "Framework",
     "GraphBuilder",
     "ModelBuilder",
diff --git a/pybuda/test/random/rgg/frameworks.py b/pybuda/test/random/rgg/frameworks.py
index d8bbbbcb..0e5b3e23 100644
--- a/pybuda/test/random/rgg/frameworks.py
+++ b/pybuda/test/random/rgg/frameworks.py
@@ -69,22 +69,22 @@ def set_calc_input_shapes(cls, framework: Framework, allow_operators: Tuple[str]
             operator.calc_input_shapes = OperatorShapes.same_input_shapes
 
 
-class Frameworks(Enum):
-    ''' Register of all frameworks '''
+def build_framework(framework_name: str, ModelBuilderType: Type[ModelBuilder], operator_repository: OperatorRepository):
+    framework = Framework(
+        framework_name=framework_name,
+        ModelBuilderType=ModelBuilderType,
+        operator_repository=operator_repository,
+    )
 
-    @staticmethod
-    def build_framework(framework_name: str, ModelBuilderType: Type[ModelBuilder], operator_repository: OperatorRepository):
-        framework = Framework(
-            framework_name=framework_name,
-            ModelBuilderType=ModelBuilderType,
-            operator_repository=operator_repository,
-        )
+    framework = FrameworkTestUtils.copy_framework(framework=framework, skip_operators=())
 
-        framework = FrameworkTestUtils.copy_framework(framework=framework, skip_operators=())
+    FrameworkTestUtils.set_calc_input_shapes(framework)
 
-        FrameworkTestUtils.set_calc_input_shapes(framework)
+    return framework
 
-        return framework
+
+class Frameworks(Enum):
+    ''' Register of all frameworks '''
 
     PYBUDA = build_framework(
         framework_name="PyBuda",
diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py
index 5e4d22b0..5224055c 100644
--- a/pybuda/test/random/rgg/utils.py
+++ b/pybuda/test/random/rgg/utils.py
@@ -257,6 +257,7 @@ def debug_inputs(cls, inputs: List[pybuda.Tensor]):
         logger.info(f"inputs: {cls.format_tensors(inputs)}")
 
 
+# TODO: rename to RateLimiter
 class RateLimitter:
     '''Rate limitter class to limit the
number of allowed operations by a rate limit factor''' From 2d3ca4d895391fbda76de1fff82d07f0850f2cf0 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Thu, 1 Aug 2024 20:33:16 +0000 Subject: [PATCH 080/116] Bringup QDQ Convnext Update third_party (cherry picked from commit 6b84758b77fd14433ef4587aacddd7d2f9df8c19) --- pybuda/csrc/buda_passes.cpp | 3 + pybuda/csrc/graph_lib/utils.cpp | 6 +- pybuda/csrc/passes/commute_utils.cpp | 32 +- pybuda/csrc/passes/erase_inverse_ops.cpp | 15 +- pybuda/csrc/passes/explicate_unsqueeze.cpp | 1 + .../passes/fuse_redundant_tm_sequence.cpp | 281 ++++++++++++++++-- .../passes/fuse_redundant_tm_sequence.hpp | 208 ++++++++++--- ...nsert_inverse_outside_quantized_region.cpp | 27 +- pybuda/csrc/passes/insert_qdq_on_biases.cpp | 24 +- pybuda/csrc/passes/make_quantized_ops.cpp | 8 +- pybuda/csrc/passes/remove_quant_dequant.cpp | 9 +- pybuda/pybuda/op/eval/pybuda/quantize.py | 9 +- 12 files changed, 498 insertions(+), 125 deletions(-) diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 64e971fe..779de593 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -11,6 +11,7 @@ #include "graph_lib/node_types.hpp" #include "graph_lib/query.hpp" #include "graph_lib/utils.hpp" +#include "passes/commute_utils.hpp" #include "passes/bind_reshape_to_io.hpp" #include "passes/constant_folding.hpp" #include "passes/dataformat.hpp" @@ -187,6 +188,8 @@ void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &d while(attempt_update) { passes::insert_inverse_outside_quantized_region(graph); attempt_update = passes::erase_inverse_ops(graph); + if (not attempt_update) + attempt_update = passes::fuse_tm_sequences(graph); } recalculate_shapes(graph); diff --git a/pybuda/csrc/graph_lib/utils.cpp b/pybuda/csrc/graph_lib/utils.cpp index dff84075..f3dcc589 100644 --- a/pybuda/csrc/graph_lib/utils.cpp +++ b/pybuda/csrc/graph_lib/utils.cpp @@ -535,10 +535,11 @@ void fork_subgraph(Graph *graph, Node *node) { graph->node_by_id(user_edge.consumer_node_id)->name()); std::string clone_name = input->name() + "_subgraph_fork_clone_" + std::to_string(user_edge.edge_creation_id); - Node *clone = graph->add_node( + TaggedNode *clone = graph->add_node( input->clone(clone_name), - graph->get_subgraph_id_for_node(input->id())); + graph->get_subgraph_id_for_node(input->id()))->as(); + clone->tag("forked_from", input->name()); auto attr = graph->get_edge_attributes(user_edge); graph->remove_edge(user_edge); // Replace user operand_edge @@ -1458,6 +1459,7 @@ void handle_change_rank(graphlib::Graph *graph, graphlib::Edge edge) change_rank->set_shape(producer->shape().as_rank(rank)); change_rank->tag("dont_erase", true); auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, edge, change_rank); + change_rank->set_output_df_from_operands(graph); if (try_consteval_op(graph, change_rank)) return graph->operand_data_edges(consumer)[0]; diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index 7d233207..de222279 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -607,7 +607,7 @@ bool commute_through_reduce( } } - else { + else if (initial_op->op_name() != "transpose") { for (graphlib::Node* next_node : next_nodes) { @@ -691,14 +691,18 @@ bool commute_through_reduce( if (not can_commute) { - auto [can_commute, new_dim] = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); + // auto can_comm_new_dim = can_commute_through_dim(initial_op, graph, 
reduce_dim, commute_up); + auto can_comm_new_dim = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); + can_commute = std::get<0>(can_comm_new_dim); + auto new_dim = std::get<1>(can_comm_new_dim); if (can_commute) { graphlib::Shape updated_commute_shape = *commute_shape; - if (producer) + if (commute_up) { - TT_ASSERT(commute_up, "Should only be using producer for shape if commuting up"); - updated_commute_shape[new_dim] = producer->shape().as_vector()[reduce_dim]; + // Producer may not be passed but we still need its shape + graphlib::Shape producer_shape = graph->data_operands(op)[0]->shape(); + updated_commute_shape[new_dim] = producer_shape[reduce_dim]; } else { @@ -708,10 +712,11 @@ bool commute_through_reduce( if (clone_shape != nullptr) { graphlib::Shape updated_clone_shape = *clone_shape; - if (producer) + if (commute_up) { - TT_ASSERT(commute_up, "Should only be using producer for shape if commuting up"); - updated_clone_shape[reduce_dim] = producer->shape().as_vector()[reduce_dim]; + // Producer may not be passed but we still need its shape + graphlib::Shape producer_shape = graph->data_operands(op)[0]->shape(); + updated_clone_shape[reduce_dim] = producer_shape[reduce_dim]; } else { @@ -918,7 +923,8 @@ bool commute_through_quantization( bool can_commute = false; if (initial_op->op_type().op == "reshape") { - + if (axis == -1) + can_commute = true; // axis of quantization must have the same volume to the left and right of it if (new_axis < 0) new_axis += op->shape().size(); @@ -967,10 +973,12 @@ bool commute_through_quantization( TT_ASSERT(can_commute, "Should not have called this if it is incommutable."); - std::vector op_attrs = op->op_attrs(); - op_attrs[1] = new_axis; + if (axis != -1) { + std::vector op_attrs = op->op_attrs(); + op_attrs[1] = new_axis; + op->overwrite_op_attrs(op_attrs); + } op->set_shape(*commute_shape); - op->overwrite_op_attrs(op_attrs); op->add_golden_transform(*golden_transform); return true; } diff --git a/pybuda/csrc/passes/erase_inverse_ops.cpp b/pybuda/csrc/passes/erase_inverse_ops.cpp index ceaf9746..94ee0ed9 100644 --- a/pybuda/csrc/passes/erase_inverse_ops.cpp +++ b/pybuda/csrc/passes/erase_inverse_ops.cpp @@ -256,19 +256,7 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co log_trace(LogGraphCompiler, " Operand commute clone: {} -> between {} and {} ", name, consumer->name(), graph->node_by_id(operand_edge.producer_node_id)->name()); // Special case for operand clones on a quantization scale - auto *consumer_op = dynamic_cast(consumer); - if (is_quantization_ops(consumer_op) and operand_index == 1) { - - // The shape should be all 1's except for (possiby) the quantization axis - auto updated_commute_shape = commute_shape; - int quant_axis = std::get(consumer_op->op_attrs()[1]); - updated_commute_shape[quant_axis] = consumer_op->shape()[quant_axis]; - update_reshape_attr(op, updated_commute_shape); - clone->set_shape(updated_commute_shape); - log_trace(LogGraphCompiler, " Operand commute clone shape: {}", updated_commute_shape); - - } - else if (retain_operand_dim) + if (retain_operand_dim) { auto updated_commute_shape = commute_shape; updated_commute_shape[operand_dims.second] = graph->node_by_id(operand_edge.producer_node_id)->shape()[operand_dims.first]; @@ -327,7 +315,6 @@ void commute_and_bypass(graphlib::Graph *graph, std::vector co auto [in_edge, out_edge] = insert_node_on_edge(graph, operand_edge, clone); // Set dataformat to match producer on operand edge 
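         // (must run after insert_node_on_edge, otherwise in_edge has no
         // wired-up producer to read the data format from)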
clone->set_output_df(graph->node_by_id(in_edge.producer_node_id)->output_df()); - handle_change_rank(graph, clone); try_commute_bcast_through_clone(graph, op); if (graphlib::InputNode *input = dynamic_cast(graph->data_operands(clone)[0])) diff --git a/pybuda/csrc/passes/explicate_unsqueeze.cpp b/pybuda/csrc/passes/explicate_unsqueeze.cpp index 158bb3aa..0f6d5bea 100644 --- a/pybuda/csrc/passes/explicate_unsqueeze.cpp +++ b/pybuda/csrc/passes/explicate_unsqueeze.cpp @@ -59,6 +59,7 @@ void explicate_unsqueeze(graphlib::Graph *graph) auto current_edge = graph->get_edges(current_node, eltwise)[0]; auto current_tms = graph->get_edge_attributes(current_edge)->get_tms(); auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, current_edge, change_rank); + change_rank->set_output_df_from_operands(graph); graph->get_edge_attributes(incoming_edge)->set_tms({}); graph->get_edge_attributes(outgoing_edge)->set_tms(current_tms); current_node = change_rank; diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp index 0171519d..4765a32f 100644 --- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp +++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp @@ -2,11 +2,126 @@ // // SPDX-License-Identifier: Apache-2.0 #include "passes/fuse_redundant_tm_sequence.hpp" - +#include "passes/commute_utils.hpp" using tt::LogTMFusion; namespace tt::passes { + +std::pair generate_inverse_info(graphlib::Graph *graph, graphlib::OpNode *op) { + + TT_ASSERT(op->op_name() == "reshape" or op->op_name() == "transpose", "Op must be reshape or transpose."); + + if (op->op_name() == "reshape") { + graphlib::Shape inverse_shape = op->shape_of_operand(graph, graph->data_operands(op)[0]); + graphlib::OpType inverse_op_type = op->op_type(); + std::vector new_attrs; + for (uint32_t dim : inverse_shape) { + new_attrs.push_back((int)dim); + } + + inverse_op_type.attr = new_attrs; + return std::make_pair(inverse_op_type, inverse_shape); + } + else { + graphlib::Shape inverse_shape = op->shape_of_operand(graph, graph->data_operands(op)[0]); + // Attrs remain the same for inverse transpose + return std::make_pair(op->op_type(), inverse_shape); + } +} + +void clone_tm_on_all_user_forks(graphlib::Graph *graph, graphlib::OpNode *tm) { + TT_ASSERT(graph->data_users(tm).size() > 1); + + for (auto user_edge : graph->user_data_edges(tm)) { + std::string clone_name = tm->name() + "_redundant_tm_pattern_tm_user_fork_clone" + std::to_string(user_edge.edge_creation_id); + graphlib::Node *clone = graph->add_node(tm->clone(clone_name), graph->get_subgraph_id_for_node(tm->id())); + insert_node_on_edge(graph, user_edge, clone); + } + log_debug(LogGraphCompiler, "Moving forks of TM: {} to operand.", tm->name()); + bypass_node(graph, tm, true); +} + +void clone_tms_on_forks(graphlib::Graph *graph, std::vector tms_to_move) { + // Iterate through these backwards + for (int i = tms_to_move.size()-1; i >= 0; i--) { + graphlib::OpNode *tm = tms_to_move[i]; + clone_tm_on_all_user_forks(graph, tm); + } +} + +bool swap_down_through_eltwise(graphlib::Graph *graph, graphlib::OpNode *tm) { + std::vector users = graph->data_users(tm); + if (users.size() > 1) { + return false; + } + + py::object eval_module = py::module_::import("pybuda.op.eval.pybuda"); + py::function is_tm = eval_module.attr("is_tm"); + + TT_ASSERT(is_tm(tm->op_type()).cast()); + + graphlib::OpNode *user = users[0]->as(); + + TT_ASSERT(user, "User must be elteise or quantization op."); + + if (not (user and (is_eltwise(user) or 
is_quantization_ops(user)))) + return false; + + // Don't want to check the operand that is tm; + std::vector user_operands = graph->operand_data_edges(user, [tm](graphlib::Edge edge) { + return edge.producer_node_id != tm->id(); + }); + std::vector user_users = graph->user_data_edges(user); + + // Place an inverse on each operand + auto [inverse_op_type, inverse_shape] = generate_inverse_info(graph, tm); + for (auto operand_edge : user_operands) { + std::string inverse_clone_name = tm->name() + "_redundant_tm_pattern_tm_commute_operand_clone" + std::to_string(operand_edge.edge_creation_id); + graphlib::OpNode *inverse_clone = graph->add_node(tm->clone(inverse_clone_name), graph->get_subgraph_id_for_node(tm->id()))->as(); + inverse_clone->overwrite_op_attrs(inverse_op_type.attr); + inverse_clone->set_shape(inverse_shape); + insert_node_on_edge(graph, operand_edge, inverse_clone); + inverse_clone->set_output_df_from_operands(graph); + + + if (graph->data_operands(inverse_clone)[0]->node_type() == graphlib::NodeType::kInput) + { + try_consteval_op(graph, inverse_clone, true); + } + } + + // Move node down and attach all users + graphlib::Edge old_tm_user_edge = retrieve_between_edge(graph, tm, user); + graphlib::Edge new_edge = graphlib::Edge(graph->data_operands(tm)[0]->id(), old_tm_user_edge.producer_output_port_id, user->id(), old_tm_user_edge.consumer_input_port_id, old_tm_user_edge.edge_type); + graph->add_edge(new_edge); + graphlib::Edge old_tm_operand_edge = retrieve_between_edge(graph, graph->data_operands(tm)[0], tm); + graph->remove_edge(old_tm_operand_edge); + graph->remove_edge(old_tm_user_edge); + + for (uint32_t i = 0; i < user_users.size(); i++) { + graphlib::Edge new_user_user_edge = graphlib::Edge(tm->id(), i, user_users[i].consumer_node_id, user_users[i].consumer_input_port_id, user_users[i].edge_type); + graph->remove_edge(user_users[i]); + graph->add_edge(new_user_user_edge); + } + + graphlib::Edge new_user_tm_edge = graphlib::Edge(user->id(), 0, tm->id(), 0, new_edge.edge_type); + graph->add_edge(new_user_tm_edge); + user->set_shape(inverse_shape); + + user->add_golden_transform(tm->op_type()); + + return true; +} + +void move_down_through_eltwise(graphlib::Graph *graph, std::vector tms_to_move) { + // Iterate through these backwards + for (int i = tms_to_move.size()-1; i >= 0; i--) { + graphlib::OpNode *tm = tms_to_move[i]; + + while (swap_down_through_eltwise(graph, tm)); + } +} bool equivalent_pattern(const TMPattern& pattern1, const TMPattern& pattern2) { if (pattern1.size() != pattern2.size()) @@ -20,30 +135,41 @@ bool equivalent_pattern(const TMPattern& pattern1, const TMPattern& pattern2) { // If both side want to check attrs, then they must be the same TT_ASSERT(pattern1[i].attrs.size() == pattern2[i].attrs.size()); TT_ASSERT(pattern1[i].op_name == "transpose", "Only support attrs check for transpose op"); - for (uint32_t j = 0; j < 2; j++) { - if (pattern1[i].attrs[j] != pattern2[i].attrs[j]) - return false; - } + + bool dim1_match_dim1 = pattern1[i].attrs[0] == pattern2[i].attrs[0]; + bool dim1_match_dim2 = pattern1[i].attrs[0] == pattern2[i].attrs[1]; + + bool dim2_match_dim1 = pattern1[i].attrs[1] == pattern2[i].attrs[0]; + bool dim2_match_dim2 = pattern1[i].attrs[1] == pattern2[i].attrs[1]; + + if (not ((dim1_match_dim1 and dim2_match_dim2) or (dim2_match_dim1 and dim1_match_dim2))) + return false; } } + return true; } -graphlib::Shape replacement_output_shape(graphlib::Shape input_shape, const TMPattern& pattern) { +std::vector> 
replacement_output_shape(graphlib::Shape input_shape, const std::vector& patterns) { py::object eval_module = py::module_::import("pybuda.op.eval.pybuda"); - for (uint i = 0; i < pattern.size(); i++) { - auto op_name = pattern[i].op_name; - auto attrs = pattern[i].attrs; - - py::function pybuda_shape = eval_module.attr("get_f_pybuda_shape")(pattern[i].as_op_type()); - std::vector> operand_tuples; - operand_tuples.push_back(input_shape.as_vector()); - py::tuple ret = pybuda_shape(operand_tuples); - graphlib::Shape shape = graphlib::Shape::create(ret[0].cast>()); - input_shape = shape; + std::vector> shapes_patterns; + + for (TMPattern pattern : patterns) { + for (uint32_t i = 0; i < pattern.size(); i++) { + auto op_name = pattern[i].op_name; + auto attrs = pattern[i].attrs; + + py::function pybuda_shape = eval_module.attr("get_f_pybuda_shape")(pattern[i].as_op_type()); + std::vector> operand_tuples; + operand_tuples.push_back(input_shape.as_vector()); + py::tuple ret = pybuda_shape(operand_tuples); + graphlib::Shape shape = graphlib::Shape::create(ret[0].cast>()); + input_shape = shape; + } + shapes_patterns.push_back(std::make_pair(input_shape, pattern)); } - return input_shape; + return shapes_patterns; } std::string pattern_to_string(const TMPattern& pattern) { @@ -68,21 +194,22 @@ bool replace_pattern_with_new_pattern( const TMPattern& current_pattern, const TMPattern& replace_pattern, graphlib::Node *sequence_producer, - graphlib::Node * terminal_node) { + std::vector pattern_sequence) { log_debug(LogTMFusion, "Trying to replace pattern from {} to {}.", pattern_to_string(current_pattern), pattern_to_string(replace_pattern)); bool multiple_user = false; std::vector users; graphlib::Node * fuse_node = nullptr; + graphlib::Node *terminal_node = pattern_sequence.back(); + pattern_sequence.pop_back(); // Check whether the matched pattern has multiple user or not // if there are multiple user at the end of the pattern matched node and // multiple user are same op and same shape // then the matched pattern can be fused by using replace pattern // and other user nodes are connected to the fused op. 
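    // Put differently: a fork at the terminal node is tolerated only when
    // every user is an identical op with an identical shape, so one fused
    // replacement can feed all of them.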
- auto current_node = graph->users(sequence_producer)[0]; - while (current_node != terminal_node) { + for (auto current_node : pattern_sequence) { users = graph->users(current_node); if (users.size() > 1) { bool user_is_terminal_node = std::find(users.begin(), users.end(), terminal_node) != users.end(); @@ -117,34 +244,29 @@ bool replace_pattern_with_new_pattern( break; } - current_node = users[0]; } // remove the edges of the users if it is same op and same shape if (multiple_user) { for (auto& user : users) { if (user != terminal_node) { - auto edge_to_remove = graph->get_edges(current_node, user)[0]; + auto edge_to_remove = graph->get_edges(pattern_sequence.back(), user)[0]; graph->remove_edge(edge_to_remove); } } } // Bypass all nodes until the end of the current pattern - current_node = graph->users(sequence_producer)[0]; - // remove old pattern - while (current_node != terminal_node) { + for (auto current_node : pattern_sequence) { TT_ASSERT(graph->users(current_node).size() == 1); - auto next_node = graph->users(current_node)[0]; bypass_node(graph, current_node, true); - current_node = next_node; } TT_ASSERT(graph->get_edges(sequence_producer, terminal_node).size() == 1); auto current_edge = graph->get_edges(sequence_producer, terminal_node)[0]; for (uint i = 0; i < replace_pattern.size(); i++) { auto op = replace_pattern[i]; - std::string name = sequence_producer->name() + "_fused_tm_op_" + std::to_string(i); + std::string name = sequence_producer->name() + "_fused_tm_op_" + std::to_string(current_edge.edge_creation_id); auto new_node = graph->add_node( std::make_unique(name, op.as_op_type()), graph->get_subgraph_id_for_node(sequence_producer->id())); fuse_node = new_node; @@ -171,6 +293,11 @@ bool replace_pattern_with_new_pattern( bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { + // Want to match the largest patterns first + std::sort(pattern_map.begin(), pattern_map.end(), [](const std::pair> &a, const std::pair> &b) { + return a.first.size() > b.first.size(); + }); + bool updated = true; py::object eval_module = py::module_::import("pybuda.op.eval.pybuda"); py::function is_tm = eval_module.attr("is_tm"); @@ -181,12 +308,15 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { // Loop through pre-defined TM patterns for (auto & pattern : pattern_map) { auto search_pattern = pattern.first; - auto replace_pattern = pattern.second; + auto replace_patterns = pattern.second; TMPattern current_pattern; + std::vector pattern_sequence; graphlib::Node * sequence_producer = nullptr; graphlib::Shape sequence_input_shape; bool potential_prefix = true; + graphlib::OpNode *blocking_eltwise = nullptr; + graphlib::OpNode *forked_tm = nullptr; // Topological traversal to find the search pattern for (auto *node : graphlib::topological_sort(*graph)) @@ -198,12 +328,21 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { if (op->as()->tag_value_or("dont_erase", false)) continue; + if ((is_eltwise(op) or is_quantization_ops(op)) and current_pattern.size() > 0){ + blocking_eltwise = op; + pattern_sequence.push_back(op); + continue; + } + if (not is_tm(op->op_type()).cast()) { // Clear and try find another viable candidate current_pattern.clear(); + pattern_sequence.clear(); sequence_producer = nullptr; potential_prefix = true; + blocking_eltwise = nullptr; + forked_tm = nullptr; continue; } @@ -213,13 +352,21 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { sequence_producer 
= graph->operands(op)[0]; } + if (graph->data_users(op).size() > 1) { + forked_tm = op; + } + current_pattern.emplace_back(op->op_type(), true); + pattern_sequence.push_back(op); // Check for match if (current_pattern.size() > search_pattern.size()) { // Clear and try find another viable candidate current_pattern.clear(); + pattern_sequence.clear(); sequence_producer = nullptr; + blocking_eltwise = nullptr; + forked_tm = nullptr; potential_prefix = true; continue; @@ -235,7 +382,10 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { if (not potential_prefix) { // Try find another viable candidate current_pattern.clear(); + pattern_sequence.clear(); sequence_producer = nullptr; + blocking_eltwise = nullptr; + forked_tm = nullptr; potential_prefix = true; continue; } @@ -243,17 +393,82 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { // Check if current pattern matches search pattern bool same_pattern = equivalent_pattern(current_pattern, search_pattern); + + if (forked_tm and forked_tm != pattern_sequence.back() and same_pattern) { + bool forked_tm_after_eltwise = false; + std::vector tms_to_move; + for (graphlib::OpNode *seq_op : pattern_sequence) { + if (not is_tm(seq_op->op_type()).cast()) { + forked_tm_after_eltwise = true; + break; + } + TT_ASSERT(is_tm(seq_op->op_type()).cast()); + + tms_to_move.push_back(seq_op); + if (seq_op == forked_tm) + break; + } + if (not forked_tm_after_eltwise) + { + clone_tms_on_forks(graph, tms_to_move); + current_pattern.clear(); + pattern_sequence.clear(); + sequence_producer = nullptr; + blocking_eltwise = nullptr; + forked_tm = nullptr; + updated = true; + updated_anything = true; + potential_prefix = true; + continue; + } + } + + // Check if there is a blocking eltwise, if there is then move all tms in the pattern above down + if (blocking_eltwise and same_pattern) { + std::vector tms_to_move; + for (graphlib::OpNode *seq_op : pattern_sequence) { + if (seq_op == blocking_eltwise) + break; + + if (is_tm(seq_op->op_type()).cast()) + tms_to_move.push_back(seq_op); + } + move_down_through_eltwise(graph, tms_to_move); + current_pattern.clear(); + pattern_sequence.clear(); + sequence_producer = nullptr; + blocking_eltwise = nullptr; + forked_tm = nullptr; + updated = true; + updated_anything = true; + potential_prefix = true; + continue; + } + // Verify i/o shape by calling pybuda shape function - graphlib::Shape output_shape = replacement_output_shape(sequence_input_shape, replace_pattern); + auto shapes_patterns = replacement_output_shape(sequence_input_shape, replace_patterns); + + bool found_match = false; + TMPattern matching_pattern; + for (auto shape_pattern : shapes_patterns) { + graphlib::Shape output_shape = shape_pattern.first; + TMPattern pattern = shape_pattern.second; + if (output_shape == op->shape()) { + found_match = true; + matching_pattern = pattern; + break; + } + } // Make sure output shape is the same after replacement - bool same_shape = output_shape == op->shape(); - if (same_pattern and same_shape) { + if (same_pattern and found_match) { // Replace current pattern with replace pattern - bool is_pattern_replaced = replace_pattern_with_new_pattern(graph, current_pattern, replace_pattern, sequence_producer, node); + bool is_pattern_replaced = replace_pattern_with_new_pattern(graph, current_pattern, matching_pattern, sequence_producer, pattern_sequence); // Break and reset current_pattern.clear(); + pattern_sequence.clear(); sequence_producer = nullptr; + 
blocking_eltwise = nullptr; updated = is_pattern_replaced; if (is_pattern_replaced) updated_anything = is_pattern_replaced; diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp index b9556ab1..53111bf8 100644 --- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp +++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.hpp @@ -47,7 +47,7 @@ namespace tt::passes }; using TMPattern = std::vector; - using TMPatternPairs = std::vector>; + using TMPatternPairs = std::vector>>; // PreDefine TM sequence pattern static TMPattern pattern_0 = { @@ -240,45 +240,175 @@ namespace tt::passes OpTypeItem("reshape", {1, 28, 28, 36}, false), }; + static TMPattern replace_4_10 = { + OpTypeItem("reshape", {1, 56, 56, 96}, false), + }; + + static TMPattern replace_4_11 = { + OpTypeItem("reshape", {1, 14, 14, 384}, false), + }; + + static TMPattern replace_4_12 = { + OpTypeItem("reshape", {1, 7, 7, 768}, false), + }; + + static TMPattern replace_4_13 = { + OpTypeItem("reshape", {1, 28, 28, 192}, false), + }; + + static TMPattern pattern_5 = { + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-3, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("transpose", {-3, -2, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + }; + + static TMPattern replace_5_0 = { + OpTypeItem("reshape", {1, 56, 56, 96}, false), + }; + + static TMPattern pattern_6 = { + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-3, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-2, -1, -1}, true), + }; + + static TMPattern replace_6_0 = { + OpTypeItem("reshape", {1, 1, 3136, 96}, false), + }; + + static TMPattern replace_6_1 = { + OpTypeItem("reshape", {1, 1, 784, 192}, false), + }; + + static TMPattern replace_6_2 = { + OpTypeItem("reshape", {1, 1, 196, 384}, false), + }; + + static TMPattern pattern_7 { + OpTypeItem("transpose", {-3, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-2, -1, -1}, true), + }; + + static TMPattern replace_7_0 = { + OpTypeItem("reshape", {1, 1, 3136, 96}, false), + }; + + static TMPattern replace_7_1 = { + OpTypeItem("reshape", {1, 1, 784, 192}, false), + }; + + static TMPattern replace_7_2 = { + OpTypeItem("reshape", {1, 1, 196, 384}, false), + }; + + static TMPattern pattern_8 { + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("reshape", {}, false), + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-2, -1, -1}, true), + }; + + static TMPattern replace_8_0 = { + OpTypeItem("reshape", {1, 1, 784, 192}, false), + }; + + static TMPattern pattern_9 = { + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-3, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-1, -2, -1}, true), + }; + + static TMPattern replace_9_0 = { + OpTypeItem("reshape", {1, 1, 196, 384}, false), + }; + + static TMPattern pattern_10 = { + OpTypeItem("reshape", {}, false), + OpTypeItem("transpose", {-3, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("transpose", {-2, -1, -1}, true), + OpTypeItem("transpose", {-3, -1, -1}, true), + }; + + static TMPattern replace_10_0 = { + OpTypeItem("reshape", {1, 14, 14, 384}, false), + }; + static TMPatternPairs pattern_map = { - {pattern_0, replace_0}, - {pattern_1, replace_1}, - {pattern_2, replace_2_0}, 
- {pattern_2, replace_2_1}, - {pattern_2, replace_2_2}, - {pattern_2, replace_2_3}, - {pattern_2, replace_2_4}, - {pattern_2, replace_2_5}, - {pattern_2, replace_2_6}, - {pattern_2, replace_2_7}, - {pattern_2, replace_2_8}, - {pattern_2, replace_2_9}, - {pattern_2, replace_2_10}, - {pattern_2, replace_2_11}, - {pattern_2, replace_2_12}, - {pattern_2, replace_2_13}, - {pattern_2, replace_2_14}, - {pattern_2, replace_2_15}, - {pattern_3, replace_3_0}, - {pattern_3, replace_3_1}, - {pattern_3, replace_3_2}, - {pattern_3, replace_3_3}, - {pattern_3, replace_3_4}, - {pattern_3, replace_3_5}, - {pattern_3, replace_3_6}, - {pattern_3, replace_3_7}, - {pattern_3, replace_3_8}, - {pattern_3, replace_3_9}, - {pattern_4, replace_4_0}, - {pattern_4, replace_4_1}, - {pattern_4, replace_4_2}, - {pattern_4, replace_4_3}, - {pattern_4, replace_4_4}, - {pattern_4, replace_4_5}, - {pattern_4, replace_4_6}, - {pattern_4, replace_4_7}, - {pattern_4, replace_4_8}, - {pattern_4, replace_4_9}, + {pattern_0, {replace_0}}, + {pattern_1, {replace_1}}, + {pattern_2, { + replace_2_0, + replace_2_1, + replace_2_2, + replace_2_3, + replace_2_4, + replace_2_5, + replace_2_6, + replace_2_7, + replace_2_8, + replace_2_9, + replace_2_10, + replace_2_11, + replace_2_12, + replace_2_13, + replace_2_14, + replace_2_15, + }}, + {pattern_3, { + replace_3_0, + replace_3_1, + replace_3_2, + replace_3_3, + replace_3_4, + replace_3_5, + replace_3_6, + replace_3_7, + replace_3_8, + replace_3_9, + }}, + {pattern_4, { + replace_4_0, + replace_4_1, + replace_4_2, + replace_4_3, + replace_4_4, + replace_4_5, + replace_4_6, + replace_4_7, + replace_4_8, + replace_4_9, + replace_4_10, + replace_4_11, + replace_4_12, + replace_4_13, + }}, + {pattern_5, {replace_5_0}}, + {pattern_6, { + replace_6_0, + replace_6_1, + replace_6_2, + }}, + {pattern_7, { + replace_7_0, + replace_7_1, + replace_7_2, + }}, + {pattern_8, {replace_8_0}}, + {pattern_9, { + replace_9_0, + }}, + {pattern_10, { + replace_10_0, + }}, }; bool fuse_tm_sequences(tt::graphlib::Graph* graph, TMPatternPairs& pattern_map_ = pattern_map); diff --git a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp index c3acf49c..f952c4bc 100644 --- a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp +++ b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp @@ -86,8 +86,8 @@ static std::tuple, graphlib::Shape, graphlib::Shape> graphlib::OpNode *iter = initial_op; - auto clone_shape = initial_op->shape(); - auto commute_shape = shape_of_only_operand(graph, initial_op); + auto commute_shape = initial_op->shape(); + auto clone_shape = shape_of_only_operand(graph, initial_op); bool found_quantize = false; while (not found_quantize) { @@ -119,7 +119,7 @@ static std::tuple, graphlib::Shape, graphlib::Shape> for (graphlib::Edge operand_edge : operand_edges) { // If the operand of this edge is already an inverse to this op, dont bother returning the edge graphlib::OpNode *operand = dynamic_cast(graph->node_by_id(operand_edge.producer_node_id)); - if (operand and not are_compatible_ops(graph, initial_op, operand, &commute_shape)) + if (not (operand and are_compatible_ops(graph, initial_op, operand, &commute_shape))) operands_outside.push_back(operand_edge); } } @@ -169,7 +169,7 @@ void insert_inverse_transpose_pair(graphlib::Graph *graph, graphlib::OpNode *tra graphlib::Shape clone_shape = operand->shape(); clone_op->set_shape(clone_shape); if (below) - clone_op->tag("dont_erase", "true"); + 
clone_op->tag("dont_erase", true); } } @@ -183,9 +183,11 @@ void insert_inverse_reshape_pair(graphlib::Graph *graph, graphlib::OpNode *resha auto *clone_inverse = graph->add_node(reshape_op->clone(inverse_name), graph->get_subgraph_id_for_node(edge.consumer_node_id)); graphlib::OpNode *clone_inverse_op = dynamic_cast(clone_inverse); clone_inverse_op->set_shape(commute_shape); - if (not below) - clone_inverse_op->tag("dont_erase", true); update_reshape_attr(clone_inverse_op, commute_shape); + if (not below) { + clone_inverse_op->tag("dont_erase", true); + } + auto [incoming_edge, outgoing_edge] = insert_node_on_edge(graph, edge, clone_inverse_op); clone_inverse_op->set_output_df_from_operands(graph); @@ -195,10 +197,19 @@ void insert_inverse_reshape_pair(graphlib::Graph *graph, graphlib::OpNode *resha graphlib::OpNode *clone_op = dynamic_cast(clone); clone_op->set_shape(clone_shape); update_reshape_attr(clone_op, clone_shape); + if (below) { + clone_op->tag("dont_erase", true); + } insert_node_on_edge(graph, outgoing_edge, clone_op); + handle_change_rank(graph, clone_op); clone_op->set_output_df_from_operands(graph); - if (below) - clone_op->tag("dont_erase", true); + auto *input = dynamic_cast(graph->node_by_id(edge.producer_node_id)); + if (input) + { + try_consteval_op(graph, clone_inverse_op, true); + } else { + handle_change_rank(graph, clone_inverse_op); + } } } diff --git a/pybuda/csrc/passes/insert_qdq_on_biases.cpp b/pybuda/csrc/passes/insert_qdq_on_biases.cpp index 0c354566..7644118e 100644 --- a/pybuda/csrc/passes/insert_qdq_on_biases.cpp +++ b/pybuda/csrc/passes/insert_qdq_on_biases.cpp @@ -18,7 +18,7 @@ namespace tt::passes { bool can_insert_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) { - if (conv2d->op_type().op != "conv2d") + if (conv2d->op_type().op != "conv2d" and conv2d->op_type().op != "conv2d_transpose") return false; if (graph->data_operands(conv2d).size() != 3) @@ -63,6 +63,15 @@ bool can_insert_on_matmul_bias(graphlib::Graph *graph, graphlib::OpNode *add) { return false; } + // The first non-TM op above the dequantize must be a matmul, or else this isnt a matmul bias-add + py::object eval_module = py::module_::import("pybuda.op.eval.pybuda"); + py::function is_tm = eval_module.attr("is_tm"); + graphlib::OpNode *operand = dynamic_cast(graph->data_operands(deq)[0]); + while (operand and is_tm(operand->op_type()).cast() and graph->data_operands(operand).size() == 1) { + operand = dynamic_cast(graph->data_operands(operand)[0]); + } + if (not operand or operand->op_name() != "matmul") + return false; // For now, the way we know this is a bias-add is if the dequantize nodes input has output_df Int32 // This is because the quantized matmul above returns an Int32. 
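
For intuition: the scale arithmetic behind these bias passes follows the usual
quantized-inference identity. A quantized matmul/conv accumulates in Int32 at
the combined scale act_scale * weight_scale, so a float bias can only join the
accumulator after being quantized with that same combined scale and a zero
point of 0. A minimal NumPy sketch of the relationship (the names here are
illustrative assumptions, not this pass's API):

    import numpy as np

    def quantize_bias(bias_fp32, act_scale, weight_scale):
        # The Int32 accumulator scale is the product of the input scales;
        # per-channel weight scales broadcast elementwise over the bias.
        bias_scale = act_scale * weight_scale
        q_bias = np.round(bias_fp32 / bias_scale).astype(np.int32)
        return q_bias, bias_scale

    # Dequantizing with the same scale recovers the bias up to rounding
    # (q_bias * bias_scale ~= bias_fp32), which is why inserting the q/dq
    # pair on the bias leaves the graph numerically unchanged.
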
@@ -97,6 +106,7 @@ bool insert_qdq_on_matmul_bias(graphlib::Graph *graph, graphlib::OpNode *add) { bias = graph->data_operands(add)[0]; bias_is_rhs = false; } + int axis = std::get(deq->op_attrs()[1]); // Insert unsqueezes to to match the rank of add handle_change_rank(graph, add); @@ -107,8 +117,13 @@ bool insert_qdq_on_matmul_bias(graphlib::Graph *graph, graphlib::OpNode *add) { bias = graph->data_operands(add)[0]; } - graphlib::Node *scale = graph->data_operands(deq)[1]; + // Find matching dim for axis + for (uint32_t i = 0; i < bias->shape().size(); i++) { + if (bias->shape()[i] == scale->shape()[0]) + axis = (int)i; + } + graphlib::Edge add_bias_edge = retrieve_between_edge(graph, bias, add); std::vector quant_attrs{0.0f, axis, std::string("torch.int32")}; std::vector dequant_attrs{0.0f, axis}; @@ -226,9 +241,10 @@ bool insert_qdq_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) return true; } -const std::array quantizeable_ops{ +const std::array quantizeable_ops{ "add", - "conv2d" + "conv2d", + "conv2d_transpose" }; bool insert_qdq_on_biases(graphlib::Graph *graph) { diff --git a/pybuda/csrc/passes/make_quantized_ops.cpp b/pybuda/csrc/passes/make_quantized_ops.cpp index f0dbb184..b0446d53 100644 --- a/pybuda/csrc/passes/make_quantized_ops.cpp +++ b/pybuda/csrc/passes/make_quantized_ops.cpp @@ -195,9 +195,11 @@ void make_quantized_add(graphlib::Graph *graph, graphlib::OpNode *add) { graphlib::Node *scale = graph->data_operands(deq0)[1]; int new_deq_axis = std::get(deq1->op_attrs()[1]); - if (new_deq_axis >= 0) - new_deq_axis = new_deq_axis - deq1->shape().size() + add->shape().size(); - + // Find matching dim for axis + for (uint32_t i = 0; i < add->shape().size(); i++) { + if (add->shape()[i] == scale->shape()[0]) + new_deq_axis = (int)i; + } std::vector dequant_attrs{0.0f, new_deq_axis}; for (graphlib::Edge consumer_edge : graph->user_data_edges(add)) { diff --git a/pybuda/csrc/passes/remove_quant_dequant.cpp b/pybuda/csrc/passes/remove_quant_dequant.cpp index 98985775..6cba3d2a 100644 --- a/pybuda/csrc/passes/remove_quant_dequant.cpp +++ b/pybuda/csrc/passes/remove_quant_dequant.cpp @@ -85,11 +85,11 @@ void bypass_qdq_pair(graphlib::Graph *graph, graphlib::OpNode *quantize, graphli uint32_t max_scale_shape = std::max(dequant_scale->shape()[0], quant_scale->shape()[0]); graphlib::Shape scale_miltiply_shape = graphlib::Shape::create(std::vector{max_scale_shape}); - scale_multiply->set_shape(scale_miltiply_shape); graph->add_edge(dequant_scale, scale_multiply); graph->add_edge(quant_scale_recip, scale_multiply); scale_multiply->set_output_df_from_operands(graph); + scale_multiply->set_shape(dequant_scale->shape()); // Potentially add broadcast on scale edge if one of the scales is not shaped [1] if (dequant_scale->shape()[0] != quant_scale->shape()[0]) { @@ -113,10 +113,10 @@ void bypass_qdq_pair(graphlib::Graph *graph, graphlib::OpNode *quantize, graphli graphlib::Node *bias = graph->data_operands(quantize)[0]; graph->add_edge(scale_multiply, bias_multiply); bias_multiply->set_shape(bias->shape()); - bias_multiply->set_output_df_from_operands(graph); graphlib::Edge bias_quant_edge = retrieve_between_edge(graph, bias, quantize); insert_node_on_edge(graph, bias_quant_edge, bias_multiply); + bias_multiply->set_output_df_from_operands(graph); graph->remove_edge(dequant_scale_edge); graph->remove_edge(quant_scale_edge); @@ -152,11 +152,6 @@ bool remove_quant_dequant(graphlib::Graph *graph) { if (op_node->op_type().op != "quantize" or op_child->op_type().op != "dequantize") 
continue; - - // Quantize should be producing an int8 - // if (std::get(op_child->op_attrs()[4]) != std::string("torch.int8")) - // continue; - bypass_qdq_pair(graph, op_node, op_child); graph_changed = true; attempt_update = true; diff --git a/pybuda/pybuda/op/eval/pybuda/quantize.py b/pybuda/pybuda/op/eval/pybuda/quantize.py index 97c264a7..545fefe0 100644 --- a/pybuda/pybuda/op/eval/pybuda/quantize.py +++ b/pybuda/pybuda/op/eval/pybuda/quantize.py @@ -152,6 +152,9 @@ def shape(type, attr, ops): elif op0[dim] == 1: # We broadcast even if dims are both one in order to use unsqueeze from broadcast function broadcast.append((1, dim, 1)) + if "buda" in type: + assert attr[1] == -1, "decomposed quantization ops must have their axis set to be invalid (-1)." + return ops[0], [] @@ -199,7 +202,7 @@ def decompose(type, attr, dc, inputs): scale = dc.op("broadcast", [scale], attrs=(i-len(scale_shape), act.shape[i]), output_df=scale.output_df) scale_shape = list(scale.shape) - out = dc.op("buda_quantize", [inputs[0], scale], attrs=attr, output_df=buda_dtype) + out = dc.op("buda_quantize", [inputs[0], scale], attrs=[zero_point, -1, out_dtype], output_df=buda_dtype) dc.fuse(out) return @@ -249,7 +252,7 @@ def decompose(type, attr, dc, inputs): torch_dtype = STRING_TO_TORCH_DTYPE[out_dtype] buda_dtype = pytorch_dtype_to_buda_dataformat(torch_dtype) - out = dc.op("buda_requantize", [act, new_scale], attrs=(out_zp, axis, rounding, out_dtype),output_df=buda_dtype) + out = dc.op("buda_requantize", [act, new_scale], attrs=(out_zp, -1, rounding, out_dtype),output_df=buda_dtype) dc.fuse(out) return @@ -278,6 +281,6 @@ def decompose(type, attr, dc, inputs): scale = dc.op("broadcast", [scale], attrs=(i-len(scale_shape), act.shape[i]), output_df=scale.output_df) scale_shape = list(scale.shape) - out = dc.op("buda_dequantize", [act, scale], attrs=attr,) + out = dc.op("buda_dequantize", [act, scale], attrs=[zero_point, -1],) dc.fuse(out) return From 3edbc578e773906a1837b62d62afde4df3521eb7 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Mon, 12 Aug 2024 17:37:09 +0000 Subject: [PATCH 081/116] Bringup QDQ mobilenetv3 (cherry picked from commit 7383040c55caa8bbd28f7e86745ab4cd78d7c981) --- pybuda/csrc/buda_passes.cpp | 2 + ...nsert_inverse_outside_quantized_region.cpp | 31 ++++++++-- pybuda/csrc/passes/insert_qdq_on_biases.cpp | 3 + pybuda/csrc/passes/make_quantized_ops.cpp | 58 +++++++++++++++++++ pybuda/csrc/passes/make_quantized_ops.hpp | 1 + 5 files changed, 89 insertions(+), 6 deletions(-) diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 779de593..033111ea 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -105,6 +105,8 @@ run_post_initial_graph_passes(graphlib::Graph *graph, py::object compiler_cfg_ob bool attempt_update = true; while (attempt_update) { attempt_update = passes::move_dequantize(graph); + if (env_as("PYBUDA_DISABLE_CONV_BIAS_QDQ_INSERTION")) + passes::separate_conv2d_bias(graph); attempt_update |= passes::make_quantized_ops(graph); attempt_update |= passes::insert_qdq_on_biases(graph); attempt_update |= passes::dequant_quant_to_requant(graph); diff --git a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp index f952c4bc..12d00c83 100644 --- a/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp +++ b/pybuda/csrc/passes/insert_inverse_outside_quantized_region.cpp @@ -30,10 +30,10 @@ bool is_op_in_quantized_region(graphlib::OpNode *op) return 
std::find(int_types.begin(), int_types.end(), op->output_df()) != int_types.end(); } -static std::tuple, graphlib::Shape, graphlib::Shape> find_downward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op) { +static std::tuple, graphlib::Shape, graphlib::Shape> find_downward_path_out(graphlib::Graph *graph, graphlib::OpNode *initial_op, graphlib::OpNode *current_node=nullptr) { std::vector users_outside; - graphlib::OpNode *iter = initial_op; + graphlib::OpNode *iter = current_node == nullptr ? initial_op : current_node; auto clone_shape = initial_op->shape(); auto commute_shape = shape_of_only_operand(graph, initial_op); @@ -45,11 +45,31 @@ static std::tuple, graphlib::Shape, graphlib::Shape> // For now if there are multiple children then dont commute std::vector user_edges = graph->user_data_edges(op); - if (user_edges.size() > 1 and op->op_name() != "buda_dequantize") - break; graphlib::Edge user_edge = user_edges[0]; - + bool all_forks_have_path_out = true; + for (int user_idx = 1; user_idx < (int)user_edges.size(); user_idx++) { + graphlib::OpNode *user = dynamic_cast(graph->node_by_id(user_edges[user_idx].consumer_node_id)); + + if (not user) { + all_forks_have_path_out = false; + break; + } + else { + auto fork_edges = std::get<0>(find_downward_path_out(graph, initial_op, user)); + if (fork_edges.size() == 0) + { + all_forks_have_path_out = false; + break; + } + users_outside.insert(users_outside.end(), fork_edges.begin(), fork_edges.end()); + } + } + if (not all_forks_have_path_out) { + users_outside.clear(); + break; + } + // For now, if there are any edge tms just dont commute if (op != initial_op) { std::vector tms = graph->get_edge_attributes(user_edge)->get_tms(); @@ -58,7 +78,6 @@ static std::tuple, graphlib::Shape, graphlib::Shape> } } - bool can_commute = can_commute_past_op(op, initial_op, graph, &commute_shape, &clone_shape, false); if (not can_commute and op != initial_op) { break; diff --git a/pybuda/csrc/passes/insert_qdq_on_biases.cpp b/pybuda/csrc/passes/insert_qdq_on_biases.cpp index 7644118e..099f25d0 100644 --- a/pybuda/csrc/passes/insert_qdq_on_biases.cpp +++ b/pybuda/csrc/passes/insert_qdq_on_biases.cpp @@ -24,6 +24,9 @@ bool can_insert_on_conv2d_bias(graphlib::Graph *graph, graphlib::OpNode *conv2d) if (graph->data_operands(conv2d).size() != 3) return false; + if (env_as("PYBUDA_DISABLE_CONV_BIAS_QDQ_INSERTION")) + return false; + // Both act and weight must have a dequant node as input and the bias cannot graphlib::OpNode *act = dynamic_cast(graph->data_operands(conv2d)[0]); graphlib::OpNode *weight = dynamic_cast(graph->data_operands(conv2d)[1]); diff --git a/pybuda/csrc/passes/make_quantized_ops.cpp b/pybuda/csrc/passes/make_quantized_ops.cpp index b0446d53..f166afa1 100644 --- a/pybuda/csrc/passes/make_quantized_ops.cpp +++ b/pybuda/csrc/passes/make_quantized_ops.cpp @@ -6,6 +6,7 @@ #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" +#include "passes/commute_utils.hpp" #include "utils/logger.hpp" #include "python_bindings_common.hpp" #include "graph_lib/node.hpp" @@ -298,6 +299,7 @@ void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { dequant->set_shape(conv2d->shape()); insert_node_on_edge(graph, consumer_edge, dequant); graph->add_edge(scale_multiply, dequant); + dequant->set_output_df(DataFormat::Float32); } // Remove scale edges so that bypass node works (it requires that the node has one operand) @@ -316,6 +318,62 @@ void make_quantized_conv2d(graphlib::Graph *graph, graphlib::OpNode *conv2d) { 
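     // The conv op itself now yields the raw Int32 accumulator; the dequantize
     // inserted on each user edge above is what returns users to Float32.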
conv2d->set_output_df(DataFormat::Int32); } + +void separate_conv2d_bias(graphlib::Graph *graph) { + bool attempt_update = true; + while (attempt_update) { + attempt_update = false; + for (tt::graphlib::Node *node : graphlib::topological_sort(*graph)) { + graphlib::OpNode *op_node = dynamic_cast(node); + if (not op_node) + continue; + if (op_node->op_name() != "conv2d" and op_node->op_name() != "conv2d_transpose") + continue; + + if (graph->data_operands(op_node).size() < 3) + continue; + + graphlib::Node *bias = graph->data_operands(op_node)[2]; + graphlib::Edge bias_edge = graph->operand_data_edges(op_node)[2]; + + + uint32_t out_channels = bias->shape()[0]; + uint32_t r_bcast = op_node->shape()[-2]; + uint32_t c_bcast = op_node->shape()[-1]; + std::vector tms; + if (r_bcast > 1) { + tms.push_back(graphlib::OpType("broadcast", {(int)-2, (int)r_bcast, true})); + } + + if (c_bcast > 1) { + tms.push_back(graphlib::OpType("broadcast", {(int)-1, (int)c_bcast, true})); + } + + graphlib::Shape new_bias_shape = graphlib::Shape::create({1, out_channels, 1, 1}); + + std::string reshape_name = op_node->name() + "_separate_bias_rank_match" + std::to_string(bias_edge.edge_creation_id); + graphlib::OpNode *reshape = graph->add_node(graphlib::create_node(reshape_name, "reshape"), graph->get_subgraph_id_for_node(node->id())); + update_reshape_attr(reshape, new_bias_shape); + reshape->set_shape(new_bias_shape); + graph->add_edge(bias, reshape); + + for (graphlib::Edge user_edge : graph->user_data_edges(op_node)) { + std::string add_name = op_node->name() + "_separate_bias_add" + std::to_string(user_edge.edge_creation_id); + graphlib::OpNode *add = graph->add_node(graphlib::create_node(add_name, "add"), graph->get_subgraph_id_for_node(node->id())); + graph->add_edge(reshape, add); + graphlib::Edge new_edge = graph->operand_data_edges(add)[0]; + graph->get_edge_attributes(new_edge)->set_tms(tms); + add->set_shape(op_node->shape()); + insert_node_on_edge(graph, user_edge, add); + add->set_output_df(bias->output_df()); + } + graph->remove_edge(bias_edge); + attempt_update = true; + + } + } +} + const std::array quantizeable_ops{ "matmul", "conv2d", diff --git a/pybuda/csrc/passes/make_quantized_ops.hpp b/pybuda/csrc/passes/make_quantized_ops.hpp index d2fa7123..9868194b 100644 --- a/pybuda/csrc/passes/make_quantized_ops.hpp +++ b/pybuda/csrc/passes/make_quantized_ops.hpp @@ -11,4 +11,5 @@ class Graph; namespace tt::passes { bool make_quantized_ops(graphlib::Graph *graph); +void separate_conv2d_bias(graphlib::Graph *graph); } \ No newline at end of file From 24fd232ef915ecfc13fc2cfc5d10e76a10d68127 Mon Sep 17 00:00:00 2001 From: Vladica Obojevic Date: Tue, 13 Aug 2024 14:11:15 +0000 Subject: [PATCH 082/116] Fix 'Limitter' typo (cherry picked from commit 737b2f96e0673068161229f53b721c4d119420ec) --- .../eltwise_binary/test_pytorch_binary.py | 4 +-- pybuda/test/random/rgg/__init__.py | 4 +-- pybuda/test/random/rgg/algorithms.py | 30 +++++++++---------- pybuda/test/random/rgg/utils.py | 5 ++-- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py index 24dc2d4e..3cf132b1 100644 --- a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py +++ b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py @@ -21,7 +21,7 @@ from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils from 
test.conftest import TestDevice -from test.random.rgg import RateLimitter +from test.random.rgg import RateLimiter class ModelFromAnotherOp(torch.nn.Module): @@ -393,7 +393,7 @@ def test_pytorch_eltwise_binary_ops_per_test_plan( rng_limiter = random.Random(0) -kwargs_limiter = RateLimitter(rng_limiter, 100, 50) +kwargs_limiter = RateLimiter(rng_limiter, 100, 50) def get_not_implemented_pytorch_binary_ops(): diff --git a/pybuda/test/random/rgg/__init__.py b/pybuda/test/random/rgg/__init__.py index 3d3177ef..995e3076 100644 --- a/pybuda/test/random/rgg/__init__.py +++ b/pybuda/test/random/rgg/__init__.py @@ -11,7 +11,7 @@ from .config import get_randomizer_config_default from .utils import StrUtils, GraphUtils from .utils import DebugUtils -from .utils import RateLimitter +from .utils import RateLimiter from .base import Framework, GraphBuilder, ModelBuilder from .base import RandomizerRunner, RandomizerCodeGenerator, process_test from .frameworks import Frameworks @@ -34,7 +34,7 @@ "StrUtils", "GraphUtils", "DebugUtils", - "RateLimitter", + "RateLimiter", "Framework", "GraphBuilder", "ModelBuilder", diff --git a/pybuda/test/random/rgg/algorithms.py b/pybuda/test/random/rgg/algorithms.py index e5c85127..983ee7b4 100644 --- a/pybuda/test/random/rgg/algorithms.py +++ b/pybuda/test/random/rgg/algorithms.py @@ -17,7 +17,7 @@ from .base import RandomizerNode, GraphBuilder from .base import Framework from .utils import RandomUtils, StrUtils, NodeUtils -from .utils import RateLimitter +from .utils import RateLimiter from .shapes import AdjustParameters @@ -73,8 +73,8 @@ def init_nodes_inputs(cls, test_context: RandomizerTestContext): rng_shape = test_context.rng_shape - constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) - same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) + constant_input_rate_limiter = RateLimiter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) + same_inputs_rate_limiter = RateLimiter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) logger.trace("Setting input nodes for open nodes") open_nodes = NodeUtils.get_open_nodes(nodes) @@ -89,10 +89,10 @@ def init_nodes_inputs(cls, test_context: RandomizerTestContext): input_shape = input_shapes[open_input_index] # There must be at least one input node for forward method - if len(graph.input_nodes) > 0 and constant_input_rate_limitter.is_allowed(): + if len(graph.input_nodes) > 0 and constant_input_rate_limiter.is_allowed(): # Creates a new constant node with the same shape constant_node = RandomizerConstantNode(out_value=None, input_shape=input_shape) - logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {constant_input_rate_limitter.limit_info()}") + logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {constant_input_rate_limiter.limit_info()}") # Stores the new constant node in the graph constant nodes graph.constant_nodes.append(constant_node) input_node = constant_node @@ -112,13 +112,13 @@ def init_nodes_inputs(cls, test_context: RandomizerTestContext): allow_repeat = len(input_nodes_with_same_shape) > 0 if allow_repeat: - if not same_inputs_rate_limitter.is_allowed(): - logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit exceeded: 
{same_inputs_rate_limitter.limit_info()}") + if not same_inputs_rate_limiter.is_allowed(): + logger.trace(f"Not allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limiter.limit_info()}") allow_repeat = False if allow_repeat: input_node = rng_shape.choice(input_nodes_with_same_shape) - logger.trace(f"Allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Allowed same input value {input_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limiter.limit_info()}") else: # create a new input node with the same shape since there are no unused input nodes with the same shape or repeat is not allowed @@ -257,8 +257,8 @@ def build_graph(self, test_context: RandomizerTestContext): fork_join_counter = 0 fork_join_max = test_context.randomizer_config.num_fork_joins_max - constant_input_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) - same_inputs_rate_limitter = RateLimitter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) + constant_input_rate_limiter = RateLimiter(rng_shape, 100, test_context.randomizer_config.constant_input_rate) + same_inputs_rate_limiter = RateLimiter(rng_shape, 100, test_context.randomizer_config.same_inputs_percent_limit) # Context object for shape calculation, node will be set later in the loop shape_calculation_context = NodeShapeCalculationContext(node=None, test_context=test_context) @@ -366,11 +366,11 @@ def build_graph(self, test_context: RandomizerTestContext): # Limit number of same inputs on same node if node_connected: - if not same_inputs_rate_limitter.is_allowed(): - logger.trace(f"Skipping same input node connection op{node_index} {node.name} -> {closing_node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limitter.limit_info()}") + if not same_inputs_rate_limiter.is_allowed(): + logger.trace(f"Skipping same input node connection op{node_index} {node.name} -> {closing_node.name}[{open_input_index}] due to rate limit exceeded: {same_inputs_rate_limiter.limit_info()}") continue else: - logger.trace(f"Allowed same input node connection op{node_index} {node.name} -> {closing_node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limitter.limit_info()}") + logger.trace(f"Allowed same input node connection op{node_index} {node.name} -> {closing_node.name}[{open_input_index}] due to rate limit not exceeded: {same_inputs_rate_limiter.limit_info()}") closing_node.inputs[open_input_index] = node node_connected = True @@ -384,10 +384,10 @@ def build_graph(self, test_context: RandomizerTestContext): input_shape = input_shapes[open_input_index] # Skip connecting constant input for last open input to avoid disconnected graph if open_nodes_count > 1 or NodeUtils.num_of_open_inputs(node) > 1: - if constant_input_rate_limitter.is_allowed(): + if constant_input_rate_limiter.is_allowed(): # Creates a new constant node with the same shape constant_node = RandomizerConstantNode(out_value=None, input_shape=input_shape) - logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not exceeded: {constant_input_rate_limitter.limit_info()}") + logger.trace(f"Allowed constant input {constant_node.out_value} -> {node.name}[{open_input_index}] due to rate limit not 
exceeded: {constant_input_rate_limiter.limit_info()}") # Stores the new constant node in the graph constant nodes graph.constant_nodes.insert(0, constant_node) # Connects the input node to the open node input diff --git a/pybuda/test/random/rgg/utils.py b/pybuda/test/random/rgg/utils.py index 5224055c..8ccc67a9 100644 --- a/pybuda/test/random/rgg/utils.py +++ b/pybuda/test/random/rgg/utils.py @@ -257,9 +257,8 @@ def debug_inputs(cls, inputs: List[pybuda.Tensor]): logger.info(f"inputs: {cls.format_tensors(inputs)}") -# TODO: rename to RateLimiter -class RateLimitter: - '''Rate limitter class to limit the number of allowed operations by a rate limit factor''' +class RateLimiter: + '''Rate limiter class to limit the number of allowed operations by a rate limit factor''' def __init__(self, rng: random.Random, max_limit: int, current_limit: int): self.rng = rng From 3dcc0a023b60efb97eea4218efe82f30109327aa Mon Sep 17 00:00:00 2001 From: kkannan Date: Mon, 12 Aug 2024 13:31:17 +0000 Subject: [PATCH 083/116] [TVM][Relay]Refactor Where operator to avoid generation of NaN tensors and add sanity test (cherry picked from commit 4df25b71805e30541593db2219b8f54ada67b2e4) --- .../tvm/sanity/tests_A/test_sanity_pytorch.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py index 0736442c..c59fd8e8 100644 --- a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py +++ b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py @@ -41,6 +41,9 @@ linear_features_in = [64] linear_features_out = [64] +import onnx +import onnxruntime +import scipy.stats @pytest.mark.parametrize( "input_shape", input_shapes, ids=[f"input{str(s)}" for s in input_shapes] @@ -3793,3 +3796,75 @@ def forward(self, Q, K, V, attn_mask): ) ) +@pytest.mark.parametrize("mask", [ + torch.tensor([[0.0, 0.0, 0.1, 0.2, 0.3, 0.4]]).to(torch.float32), + torch.tensor([[0.00, 0.00, 0.02, 0.07, 0.03, 0.04]]).to(torch.float32), + torch.tensor([[0.000, 0.000, 0.002, 0.007, 0.003, 0.004]]).to(torch.float32), + torch.tensor([[0, 0, torch.finfo(torch.float32).min, -torch.finfo(torch.float32).min, torch.finfo(torch.float32).min, -torch.finfo(torch.float32).min]]).to(torch.float32), +]) + +def test_where(test_device, mask): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + class Masked_fill(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensor, mask): + output = input_tensor.masked_fill(mask.bool(), torch.finfo(torch.float32).min) + return output + + input_tensor = torch.tensor([[0, 0, 0, 0, torch.finfo(torch.float32).min, 0]]).to(torch.float32) + model = Masked_fill() + model.eval() + + # pytorch inference + torch_output = model(input_tensor,mask) + model_file = "sanity_where_onnx.onnx" + + # Export to ONNX model + torch.onnx.export( + model, + (input_tensor,mask), + model_file, + export_params=True, + opset_version=16, + do_constant_folding=True, + input_names=['input_tensor','mask'], + output_names=['output'], + + ) + + onnx_model = onnx.load(model_file) + onnx.checker.check_model(onnx_model) + + tt_model = pybuda.OnnxModule('sanity_where_onnx',onnx_model, model_file) + + # ONNX inference + ort_session = onnxruntime.InferenceSession(model_file) + onnx_input = {"input_tensor": input_tensor.numpy(),"mask": mask.numpy()} + 
onnx_output = ort_session.run(None, onnx_input) + + # pcc + for pt, on in zip(torch_output, onnx_output): + correlation_coefficient, _ = scipy.stats.pearsonr(pt.detach().numpy().reshape(-1), on.reshape(-1)) + print("torch vs onnx pcc = ",correlation_coefficient) + + verify_module( + tt_model, + input_shapes=[(input_tensor.shape, mask.shape, )], + inputs=[(input_tensor, mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_all=True, + ) + ) + + os.remove(model_file) From 959a28f5603fb4150cddf9596c3fbe7d38c66906 Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Wed, 14 Aug 2024 09:17:58 +0000 Subject: [PATCH 084/116] [TVM][Relay] Update the elu pytorch frontend implementation to handle inf values from exp and add sanity test (cherry picked from commit d83765b294a1aa8c920eddc15b90f151f88a7604) --- .../sanity/tests_B/test_pattern_matcher.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py index 39a566c0..fa5c1e7b 100644 --- a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py +++ b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py @@ -247,3 +247,43 @@ def forward(self, input_tensor): ), ) + +@pytest.mark.parametrize("alpha_val", (1.0, 0.367, 4.675, -8.0, -0.6743, 16.296)) +def test_elu_pytorch(test_device, alpha_val): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + class elu_model(torch.nn.Module): + def __init__(self): + super().__init__() + self.elu = torch.nn.ELU(alpha=alpha_val) + def forward(self, input_tensor): + return self.elu(input_tensor) + + model = elu_model() + model.eval() + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule( + "pt_elu", model + ) + + input_sample = torch.randint(-200, 200, (1, 3, 512, 512)).to(torch.float32) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(input_sample.shape,)], + inputs=[(input_sample,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + ), + ) From 21ed49ec37d4335418ff1659c9e240b2f0ee373b Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic Date: Wed, 14 Aug 2024 11:41:49 +0000 Subject: [PATCH 085/116] Cherry-pick Github PR: README.md: Update docs link (cherry picked from commit cbb7c50eca10dfaa82d0edf7f20fc8aee28dfdf4) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3d00baf..b89721dc 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ See: [Docs](https://docs.tenstorrent.com/tenstorrent/v/tt-buda) ## Build -https://docs.tenstorrent.com/tenstorrent/v/tt-buda/installation +See: [TT-Buda Installation](https://github.com/tenstorrent/tt-buda-demos/blob/main/first_5_steps/1_install_tt_buda.md) ## Env setup From fe7df7a4c471fa9be702629e3bb9e67dbc465462 Mon Sep 17 00:00:00 2001 From: kkannan Date: Fri, 16 Aug 2024 06:39:43 +0000 Subject: [PATCH 086/116] [TVM][Relay] Add support for diagonal parameter in tril function (cherry picked from commit d8ddf8f7c311755482a32634d8419c67a18ebcc3) --- .../tvm/sanity/tests_A/test_sanity_pytorch.py | 37 
+++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py index c59fd8e8..b0c55706 100644 --- a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py +++ b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py @@ -3868,3 +3868,40 @@ def forward(self, input_tensor, mask): ) os.remove(model_file) + +@pytest.mark.parametrize("diagonal", [1, 0, -1, 2, -2047, 1024]) +def test_tril(diagonal, test_device): + + class Tril(torch.nn.Module): + def __init__(self, diagonal): + super().__init__() + self.diagonal = diagonal + + def forward(self, mask): + mask = mask[0] + context_mask = torch.tril(mask, diagonal=self.diagonal) + return context_mask + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + # Define the input tensor and model + mask = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).to(torch.float32) + model = Tril(diagonal) + model.eval() + mask = mask.unsqueeze(0) + + verify_module( + pybuda.PyTorchModule("tril", model), + input_shapes=[(mask.shape,)], + inputs=[(mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_all=True, + ), + ) From ccad7e41fd83fbfe84f380a9a10c6e759bc60314 Mon Sep 17 00:00:00 2001 From: jserbedzija Date: Tue, 2 Jul 2024 19:29:22 +0000 Subject: [PATCH 087/116] [Blackhole] Fix issue when generating cluster descriptor (cherry picked from commit 61ee86010cb56c14aa11cb24e7d78cc48f3cb705) --- pybuda/csrc/backend_api/device_config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybuda/csrc/backend_api/device_config.hpp b/pybuda/csrc/backend_api/device_config.hpp index 208df332..465c70d4 100644 --- a/pybuda/csrc/backend_api/device_config.hpp +++ b/pybuda/csrc/backend_api/device_config.hpp @@ -102,7 +102,7 @@ struct DeviceConfig return; // Get backend related parameters - if (this->is_wormhole_b0()) + if (arch == ARCH::WORMHOLE_B0) { // Load and cache system-level params if needed if (this->backend_type == "silicon") From f3d9f43f3ecc22aba73669785e245438a7dd9781 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Tue, 13 Aug 2024 19:35:23 +0000 Subject: [PATCH 088/116] fix names and update output_host_tms dict The node names need to be updated after data parallel operations. 
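For reference, the re-keying applied to output_host_tms can be sketched in
isolation as follows (a minimal, hypothetical Python sketch mirroring the
diff's logic; the key "out" is illustrative, not a real graph output name):

    # Under N300 data parallel, each logical output "x" is duplicated as
    # "x" and "x_dp_out_1"; the fix-up adds per-device keys "x.0" / "x.1",
    # presumably so lookups by the suffixed node names succeed.
    def rekey_output_host_tms(tms: dict) -> dict:
        fixed = {}
        for key, val in tms.items():
            if "_dp_out_1" in key:
                base = key.split("_dp_out_1")[0]
                if base in tms:
                    fixed[f"{base}.0"] = tms[base]  # device 0 keeps base TM
                    fixed[f"{base}.1"] = val        # device 1 gets replica TM
        tms.update(fixed)  # original keys stay in place, as in the patch
        return tms

    # rekey_output_host_tms({"out": tm0, "out_dp_out_1": tm1})
    # -> adds {"out.0": tm0, "out.1": tm1}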
(cherry picked from commit 5396712e4a7c01b54e8d2c92a05528f8abac450a) --- pybuda/pybuda/compiled_graph_state.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pybuda/pybuda/compiled_graph_state.py b/pybuda/pybuda/compiled_graph_state.py index c00e3962..2a6854c6 100644 --- a/pybuda/pybuda/compiled_graph_state.py +++ b/pybuda/pybuda/compiled_graph_state.py @@ -190,7 +190,8 @@ def from_compiled_graph(device: "TTDevice", compile_results: CompileResults) -> ordered_parameter_node_names ) - if os.getenv("PYBUDA_N300_DATA_PARALLEL", 0): + is_data_parallel = int(os.getenv("PYBUDA_N300_DATA_PARALLEL", "0")) + if is_data_parallel: def replicate_items(items_to_replicate): dev0 = { f"{k}.0" : v for k,v in items_to_replicate.items() } dev1 = { f"{k}.1" : v for k,v in items_to_replicate.items() } @@ -219,6 +220,16 @@ def replicate_items(items_to_replicate): logger.debug("ordered_output_names = {}", ordered_output_names) logger.debug("ordered_output_shapes = {}", ordered_output_shapes) + if is_data_parallel: + real_output_host_tms = {} + for key, val in compile_results.pass_specific_output_kwargs["output_host_tms"].items(): + if "_dp_out_1" in key: + search_key = key.split("_dp_out_1")[0] + if search_key in compile_results.pass_specific_output_kwargs["output_host_tms"].keys(): + real_output_host_tms[f"{search_key}.0"] = compile_results.pass_specific_output_kwargs["output_host_tms"][search_key] + real_output_host_tms[f"{search_key}.1"] = val + compile_results.pass_specific_output_kwargs["output_host_tms"].update(real_output_host_tms) + return CompiledGraphState( microbatch=graph.get_microbatch(), graph_name=graph.get_name(), From c635d5d8346d1884530a0a4ae51ae3ef22eddc92 Mon Sep 17 00:00:00 2001 From: jserbedzija Date: Thu, 15 Aug 2024 09:26:26 +0000 Subject: [PATCH 089/116] Remove flax dependency from Pybuda (cherry picked from commit 7b2d4b0ce777488e155f6a625c3693a5a68ae1e0) --- .../high_prio/cnn/pytorch/test_vit.py | 7 +- .../tvm/sanity/tests_C/test_sanity_jax.py | 136 ++++++++++-------- python_env/core_requirements.txt | 1 - 3 files changed, 82 insertions(+), 62 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py index ac21343d..63a0a6ed 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py @@ -116,8 +116,11 @@ def test_vit_classification_1x1_demo(test_device, mode, variant): pytest.skip("Not supported") # Setup for 1x1 grid - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" - + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" diff --git a/pybuda/test/tvm/sanity/tests_C/test_sanity_jax.py b/pybuda/test/tvm/sanity/tests_C/test_sanity_jax.py index 71f28ab7..e1d81533 100644 --- a/pybuda/test/tvm/sanity/tests_C/test_sanity_jax.py +++ b/pybuda/test/tvm/sanity/tests_C/test_sanity_jax.py @@ -6,7 +6,7 @@ import jax from jax import numpy as jnp -from flax import linen as nn +# from flax import linen as nn from pybuda import ( JaxModule, @@ -17,6 +17,8 @@ from pybuda.config import CompileDepth, _get_global_compiler_config def test_tvm_linear(test_kind, test_device): + pytest.skip() + class 
Linear(nn.Module): features: Sequence[int] @@ -57,6 +59,8 @@ def __call__(self, x): def test_tvm_linear_relu(test_kind, test_device): + pytest.skip() + class Linear(nn.Module): features: Sequence[int] @@ -101,6 +105,8 @@ def __call__(self, x): def test_tvm_multiple_outputs(test_kind, test_device): + pytest.skip() + if test_kind.is_training(): pytest.skip() @@ -150,6 +156,8 @@ def __call__(self, x): def test_tvm_scaled_dot_product_attention(test_kind, test_device): + pytest.skip() + class ScaledDotProductAttention(nn.Module): def __call__(self, q, k, v): kt = jnp.transpose(k, axes=[0, 2, 1]) @@ -200,6 +208,8 @@ def __call__(self, q, k, v): def test_tvm_layer_norm(test_kind, test_device): + pytest.skip() + # TODO: Checkout why recompute fails. if test_kind == TestKind.TRAINING_RECOMPUTE: pytest.skip() @@ -248,6 +258,8 @@ def __call__(self, x): def test_tvm_conv2d(test_kind, test_device): + pytest.skip() + class Conv2d(nn.Module): @nn.compact def __call__(self, x): @@ -296,73 +308,75 @@ def __call__(self, x): ) ) -class XLAGatherModule1(nn.Module): - def __call__(self, x): - return x[:, :, :, 0] +# class XLAGatherModule1(nn.Module): +# def __call__(self, x): +# return x[:, :, :, 0] -class XLAGatherModule2(nn.Module): - def __call__(self, x): - return x[:, :, 0:3, 0] +# class XLAGatherModule2(nn.Module): +# def __call__(self, x): +# return x[:, :, 0:3, 0] -class XLAGatherModule3(nn.Module): - def __call__(self, x): - return x[:, :, 3:9, 0] +# class XLAGatherModule3(nn.Module): +# def __call__(self, x): +# return x[:, :, 3:9, 0] -class XLAGatherModule4(nn.Module): - def __call__(self, x): - return x[:, 3:9, :, 0] +# class XLAGatherModule4(nn.Module): +# def __call__(self, x): +# return x[:, 3:9, :, 0] -class XLAGatherModule5(nn.Module): - def __call__(self, x): - return x[:, 3:9, :, 2:4] +# class XLAGatherModule5(nn.Module): +# def __call__(self, x): +# return x[:, 3:9, :, 2:4] -class XLAGatherModule6(nn.Module): - def __call__(self, x): - return x[:, 3:9, 1:6, 2:4] - -@pytest.mark.parametrize("slice_module", (XLAGatherModule1, XLAGatherModule2, XLAGatherModule3, XLAGatherModule4, XLAGatherModule5, XLAGatherModule6,)) -def test_tvm_xla_gather(test_kind, test_device, slice_module): - if test_kind.is_training(): - pytest.skip() +# class XLAGatherModule6(nn.Module): +# def __call__(self, x): +# return x[:, 3:9, 1:6, 2:4] + +# @pytest.mark.parametrize("slice_module", (XLAGatherModule1, XLAGatherModule2, XLAGatherModule3, XLAGatherModule4, XLAGatherModule5, XLAGatherModule6,)) +# def test_tvm_xla_gather(test_kind, test_device, slice_module): +# if test_kind.is_training(): +# pytest.skip() - if slice_module in [XLAGatherModule3, XLAGatherModule4, XLAGatherModule5, XLAGatherModule6]: - # tenstorrent/pybuda#1608 - pytest.skip() - - compiler_config = _get_global_compiler_config() - if not test_kind.is_training(): - compiler_config.compile_depth = CompileDepth.FULL - else: - compiler_config.compile_depth = CompileDepth.FULL - compiler_config.retain_tvm_python_files = True - compiler_config.enable_xla_jax_convert = True - - # Initialize module - input_shape = (1, 28, 28, 4) - framework_module = slice_module() - - # Bind params to module - key = jax.random.PRNGKey(0) - act = jax.random.uniform(key, input_shape) - vars = framework_module.init(key, act) - framework_module = framework_module.bind(vars) - - # Run module - # res = framework_module(act) - - pybuda_module = JaxModule("jax_xla_gather", framework_module) - verify_module( - pybuda_module, - (input_shape,), - verify_cfg=VerifyConfig( - 
arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - ) - ) +# if slice_module in [XLAGatherModule3, XLAGatherModule4, XLAGatherModule5, XLAGatherModule6]: +# # tenstorrent/pybuda#1608 +# pytest.skip() + +# compiler_config = _get_global_compiler_config() +# if not test_kind.is_training(): +# compiler_config.compile_depth = CompileDepth.FULL +# else: +# compiler_config.compile_depth = CompileDepth.FULL +# compiler_config.retain_tvm_python_files = True +# compiler_config.enable_xla_jax_convert = True + +# # Initialize module +# input_shape = (1, 28, 28, 4) +# framework_module = slice_module() + +# # Bind params to module +# key = jax.random.PRNGKey(0) +# act = jax.random.uniform(key, input_shape) +# vars = framework_module.init(key, act) +# framework_module = framework_module.bind(vars) + +# # Run module +# # res = framework_module(act) + +# pybuda_module = JaxModule("jax_xla_gather", framework_module) +# verify_module( +# pybuda_module, +# (input_shape,), +# verify_cfg=VerifyConfig( +# arch=test_device.arch, +# devtype=test_device.devtype, +# test_kind=test_kind, +# ) +# ) def test_tvm_dense(test_kind, test_device): + pytest.skip() + class JAX_dense(nn.Module): @nn.compact def __call__(self, x): @@ -399,6 +413,8 @@ def __call__(self, x): ) def test_tvm_conv2d_transpose(test_kind, test_device): + pytest.skip() + class Conv2d(nn.Module): @nn.compact def __call__(self, img): @@ -445,6 +461,8 @@ def __call__(self, img): ) def test_tvm_conv2d_dilated(test_kind, test_device): + pytest.skip() + class Conv2d(nn.Module): @nn.compact def __call__(self, img): diff --git a/python_env/core_requirements.txt b/python_env/core_requirements.txt index bc0b31c4..0442bbb0 100644 --- a/python_env/core_requirements.txt +++ b/python_env/core_requirements.txt @@ -13,7 +13,6 @@ dill==0.3.5.1 flatbuffers==23.5.26 # This is needed to prevent AttributeError: module 'ml_dtypes' has no attribute 'float8_e4m3b11' ml-dtypes==0.2.0 -flax==0.6.0 jax==0.4.13 jaxlib==0.4.11 loguru==0.5.3 From 0d7622471a44f1311d1740f7bd0102fa5ce46962 Mon Sep 17 00:00:00 2001 From: jserbedzija Date: Fri, 16 Aug 2024 20:36:11 +0000 Subject: [PATCH 090/116] [Blackhole] Fix functional models in nightly run (cherry picked from commit e24334bba567b782ff5fbfd03a9cc6f89c16969c) --- .../high_prio/cnn/onnx/test_ddrnet.py | 5 ++ .../high_prio/cnn/onnx/test_dla.py | 6 +- .../high_prio/cnn/onnx/test_hardnet.py | 4 ++ .../cnn/onnx/test_perceiverio_learned.py | 6 ++ .../high_prio/cnn/onnx/test_retinanet.py | 4 ++ .../cnn/onnx/test_segformer_imgcls_1.py | 5 ++ .../high_prio/cnn/onnx/test_yolo_v5.py | 41 +++++++++++ .../high_prio/cnn/onnx/test_yolo_x.py | 64 ++++++++++++++++- .../high_prio/cnn/pytorch/test_blazepose.py | 13 +++- .../cnn/pytorch/test_efficientnet.py | 5 ++ .../cnn/pytorch/test_efficientnet_lite.py | 2 +- .../high_prio/cnn/pytorch/test_hardnet.py | 8 +++ .../cnn/pytorch/test_inception_v4.py | 14 ++++ .../cnn/pytorch/test_mobilenet_v1_ssd.py | 6 +- .../high_prio/cnn/pytorch/test_monodle.py | 2 + .../cnn/pytorch/test_perceiverio_conv.py | 3 + .../cnn/pytorch/test_perceiverio_fourier.py | 14 ++++ .../cnn/pytorch/test_perceiverio_learned.py | 7 ++ .../high_prio/cnn/pytorch/test_pidnet.py | 30 ++++++++ .../cnn/pytorch/test_segformer_imgcls_1.py | 4 ++ .../cnn/pytorch/test_ssd300_resnet50.py | 4 ++ .../high_prio/cnn/pytorch/test_vit.py | 16 ++++- .../high_prio/cnn/pytorch/test_yolo_v3.py | 7 +- .../high_prio/cnn/pytorch/test_yolo_v5.py | 39 +++++++++-- .../high_prio/cnn/pytorch/test_yolo_v6.py | 4 +- 
.../high_prio/cnn/pytorch/test_yolo_x.py | 68 +++++++++++++++++++ .../cnn/tflite/test_efficientnet_lite.py | 14 +++- .../cnn/tflite/test_hand_landmarker.py | 14 +++- .../cnn/tflite/test_mobilenet_ssd.py | 7 +- .../cnn/tflite/test_pose_landmark.py | 14 +++- .../high_prio/nlp/pytorch/test_bert.py | 3 + .../high_prio/nlp/pytorch/test_codegen.py | 6 ++ .../high_prio/nlp/pytorch/test_gemma_2b.py | 7 +- .../high_prio/nlp/pytorch/test_gptneo.py | 4 ++ .../high_prio/nlp/pytorch/test_phi2.py | 4 ++ .../high_prio/nlp/pytorch/test_t5.py | 14 ++-- .../high_prio/nlp/pytorch/test_whisper_1.py | 31 +++++++++ .../high_prio/nlp/pytorch/test_xglm.py | 5 +- 38 files changed, 470 insertions(+), 34 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py index fa30bdec..7385874f 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py @@ -108,6 +108,11 @@ def test_ddrnet_semantic_segmentation_onnx(variant, test_device): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "24576" compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11915.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 32)) + elif test_device.arch == BackendDevice.Blackhole: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{32*1024}" + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone925.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 8)) + # Load and validate the model if variant == "ddrnet_23_slim_1024": load_path = f"third_party/confidential_customer_models/customer/model_0/files/cnn/ddrnet/{variant}.onnx" diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py index 8786465b..5d30f0e6 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py @@ -12,6 +12,7 @@ from pybuda._C.backend_api import BackendDevice import torchvision.transforms as transforms from PIL import Image +import shutil variants = [ @@ -65,7 +66,7 @@ def test_dla_onnx(test_device, variant): tt_model = pybuda.OnnxModule(model_name, onnx_model, onnx_model_path) pcc = 0.99 - if test_device.arch == BackendDevice.Wormhole_B0: + if test_device.arch in [BackendDevice.Wormhole_B0, BackendDevice.Blackhole]: if variant == "dla34": pcc = 0.98 elif variant == "dla169": @@ -93,4 +94,5 @@ def test_dla_onnx(test_device, variant): # Cleanup model files os.remove(onnx_model_path) - os.rmdir(onnx_dir_path) + # os.rmdir(onnx_dir_path) + shutil.rmtree(onnx_dir_path) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py index bc784d3f..80754717 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py @@ -28,6 +28,10 @@ def test_hardnet_onnx(variant, test_device): if variant == "hardnet85" and test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + if test_device.arch == BackendDevice.Blackhole: + if variant == "hardnet85": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{42*1024}" + # Download an example image url, filename = ( 
"https://github.com/pytorch/hub/raw/master/images/dog.jpg", diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py index 1fdf91e7..1152ad33 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio_learned.py @@ -47,6 +47,12 @@ def test_perceiverio_learned_imgcls_onnx(test_device): if test_device.devtype == pybuda.BackendType.Silicon: verify_enabled = False + elif test_device.arch == pybuda.BackendDevice.Blackhole: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{140*1024}" + compiler_cfg.balancer_op_override("add_63", "t_stream_shape", (1, 2)) + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.91 + onnx_model_path = ( "third_party/confidential_customer_models/internal/perceiverio/files/onnx/" + str(model_name).split("/")[-1].replace("-", "_") diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py index af67f6d2..95ca6693 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py @@ -169,6 +169,10 @@ def test_retinanet_onnx(variant, test_device): compiler_cfg.balancer_op_override("conv2d_393.dc.matmul.11", "t_stream_shape", (1,1)) compiler_cfg.balancer_op_override("conv2d_371.dc.matmul.11", "t_stream_shape", (1,1)) + if test_device.arch == BackendDevice.Blackhole: + if variant == "retinanet_rn50fpn" or variant == "retinanet_rn152fpn": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{72*1024}" + # Prepare model load_path = ( f"third_party/confidential_customer_models/internal/retinanet/files/onnx/{variant}.onnx" diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py index 21aa3fb0..dd83a676 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls_1.py @@ -62,6 +62,11 @@ def test_segformer_imgcls_onnx_1(test_device, variant): elif variant == "nvidia/mit-b2": pcc_value = 0.96 + elif test_device.arch == pybuda.BackendDevice.Blackhole: + if test_device.devtype == pybuda.BackendType.Silicon: + if variant == "nvidia/mit-b0": + pcc_value = 0.97 + # Load the sample image pixel_values = get_sample_data(variant) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py index ea96f815..38341e7c 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py @@ -182,6 +182,18 @@ def test_yolo_v5_480x480_onnx(test_device, variant): (1, 1), ) + elif test_device.arch == BackendDevice.Blackhole: + + if variant == "yolov5n": + compiler_cfg.place_on_new_epoch("_fused_op_7") + elif variant == "yolov5l": + compiler_cfg.place_on_new_epoch("_fused_op_11") + compiler_cfg.place_on_new_epoch("_fused_op_12") + compiler_cfg.place_on_new_epoch("_fused_op_25") + elif variant == "yolov5x": + compiler_cfg.place_on_new_epoch("conv2d_44.dc.matmul.11") + compiler_cfg.place_on_new_epoch("_fused_op_13") + input_size = 480 # Load the ONNX model @@ -291,6 +303,35 @@ def test_yolo_v5_640x640_onnx(test_device, variant): (1, 1), ) + elif test_device.arch == BackendDevice.Blackhole: + + 
os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + + if variant == "yolov5n": + compiler_cfg.balancer_op_override("concatenate_259.dc.concatenate.7", "grid_shape", (1, 1)) + + elif variant == "yolov5s": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + compiler_cfg.balancer_op_override("concatenate_259.dc.concatenate.7", "grid_shape", (1, 1)) + + elif variant == "yolov5m": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + compiler_cfg.balancer_op_override("concatenate_332.dc.concatenate.7", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("concatenate_332.dc.concatenate.7", "t_stream_shape", (1, 1)) + + elif variant == "yolov5l": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + compiler_cfg.balancer_op_override("concatenate_405.dc.concatenate.7", "grid_shape", (1, 1)) + + elif variant == "yolov5x": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{374*1024}" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.place_on_new_epoch("concatenate_40.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12") + compiler_cfg.place_on_new_epoch("concatenate_478.dc.sparse_matmul.10.lc2") + compiler_cfg.balancer_op_override("concatenate_478.dc.concatenate.7", "grid_shape", (1, 1)) + input_size = 640 # Load the ONNX model diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py index c30dd092..a72a82d3 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py @@ -191,6 +191,68 @@ def test_yolox_onnx(variant, test_device): compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) compiler_cfg.place_on_new_epoch("concatenate_512.dc.sparse_matmul.11.lc2") + if test_device.arch == BackendDevice.Blackhole: + + if variant in ["yolox_nano", "yolox_tiny"]: + + compiler_cfg.place_on_new_epoch("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.14.lc2") + compiler_cfg.place_on_new_epoch("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.14.lc2") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 2)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "81920" + + elif variant == "yolox_s": + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_33.dc.matmul.8", "t_stream_shape", (1, 1)) + compiler_cfg.place_on_new_epoch("concatenate_275.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_m": + + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + compiler_cfg.place_on_new_epoch("conv2d_187.dc.matmul.8") + 
compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", "t_stream_shape", (5, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_354.dc.sparse_matmul.11.lc2") + + elif variant in ["yolox_l", "yolox_darknet", "yolox_x"]: + + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + if variant == "yolox_l": + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_433.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_darknet": + + compiler_cfg.balancer_op_override("conv2d_28.dc.matmul.8", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_33.dc.matmul.8", "t_stream_shape", (1, 1)) + compiler_cfg.place_on_new_epoch("concatenate_222.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "53248" + + elif variant == "yolox_x": + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_512.dc.sparse_matmul.11.lc2") + compiler_cfg.place_on_new_epoch("conv2d_379.dc.matmul.8") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + # prepare input if variant in ["yolox_nano", "yolox_tiny"]: input_shape = (416, 416) @@ -213,7 +275,7 @@ def test_yolox_onnx(variant, test_device): tt_model = pybuda.OnnxModule(model_name, onnx_model, onnx_model_path) # PCC - if test_device.arch == BackendDevice.Wormhole_B0: + if test_device.arch in [BackendDevice.Wormhole_B0, BackendDevice.Blackhole]: if variant == "yolox_nano": pcc = 0.93 else: diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py index f302119e..a1165717 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py @@ -159,8 +159,13 @@ def test_blaze_palm_pytorch_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + # Set PyBDUA environment variable - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" 
os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" @@ -207,7 +212,11 @@ def test_blaze_hand_pytorch_1x1(test_device): pytest.skip() # Set PyBDUA environment variable - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index 33040944..a4413b8f 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -68,6 +68,11 @@ def test_efficientnet_timm(variant, test_device): compiler_cfg.amp_level = 1 compiler_cfg.default_df_override=pybuda.DataFormat.Float16_b os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + elif test_device.arch == BackendDevice.Blackhole: + pcc_value = 0.92 + compiler_cfg.amp_level = 1 + compiler_cfg.default_df_override=pybuda.DataFormat.Float16_b + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" # Load model framework_model = download_model(timm.create_model, variant, pretrained=True) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py index 6cb2dddf..90221959 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py @@ -48,7 +48,7 @@ def test_efficientnet_lite_0_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - if test_device.arch == BackendDevice.Wormhole_B0: + if test_device.arch == BackendDevice.Wormhole_B0 or test_device.arch == BackendDevice.Blackhole: os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" elif test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py index 0ed81cb7..d2d3f5ab 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py @@ -31,6 +31,14 @@ def test_hardnet_pytorch(test_device, variant): os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + if test_device.arch == BackendDevice.Blackhole: + if variant == "hardnet68ds": + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + elif variant == "hardnet85": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "48000" + # load only the model architecture without pre-trained weights. 
model = torch.hub.load("PingoLH/Pytorch-HarDNet", variant, pretrained=False) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py index 79ca4a4a..0b3b714e 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py @@ -102,6 +102,13 @@ def test_inception_v4_osmr_pytorch(test_device): test_device, "inceptionv4", ) + if test_device.arch == BackendDevice.Blackhole: + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + compiler_cfg.place_on_new_epoch("multiply_35") + verify_module( model, input_shapes=[(inputs[0].shape,)], @@ -154,6 +161,13 @@ def test_inception_v4_timm_pytorch(test_device): test_device, 'inception_v4', ) + if test_device.arch == BackendDevice.Blackhole: + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + compiler_cfg.place_on_new_epoch("multiply_35") + verify_module( model, input_shapes=[(inputs[0].shape,)], diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py index 3c2348be..2c598924 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py @@ -20,7 +20,11 @@ def test_mobilenet_v1_ssd_pytorch_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" # STEP 1: Set PyBuda configuration parameters compiler_cfg = ( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py index 9b0a2958..9f5af7d9 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_monodle.py @@ -25,6 +25,8 @@ def test_monodle_pytorch(test_device): elif test_device.arch == pybuda.BackendDevice.Grayskull: os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" pcc = 0.93 + elif test_device.arch == pybuda.BackendDevice.Blackhole: + pcc = 0.97 model_name = "monodle_pytorch" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py index 129155b8..51b86d0c 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_conv.py @@ -58,6 +58,9 @@ def test_perceiverio_conv_imgcls_pytorch(test_device): if test_device.devtype == pybuda.BackendType.Silicon: verify_enabled = False + elif test_device.arch == pybuda.BackendDevice.Blackhole: + pcc_value = 0.97 + # Sample Image pixel_values = get_sample_data(variant) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py 
b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py index 62c5fa65..061f5867 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_fourier.py @@ -64,6 +64,20 @@ def test_perceiverio_fourier_imgcls_pytorch(test_device): if test_device.devtype == pybuda.BackendType.Silicon: verify_enabled = False + elif test_device.arch == pybuda.BackendDevice.Blackhole: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.balancer_op_override( + "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 4) + ) + # compiler_cfg.balancer_op_override("add_33", "t_stream_shape", (1, 2)) + compiler_cfg.place_on_new_epoch("matmul_30") + compiler_cfg.place_on_new_epoch("hslice_41.dc.sparse_matmul.2.lc2") + compiler_cfg.place_on_new_epoch("multiply_32") + compiler_cfg.place_on_new_epoch("matmul_45") + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.96 + # Sample Image pixel_values = get_sample_data(variant) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py index 4a4bb6b3..eefff8c5 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio_learned.py @@ -58,6 +58,13 @@ def test_perceiverio_learned_imgcls_pytorch(test_device): if test_device.devtype == pybuda.BackendType.Silicon: verify_enabled = False + elif test_device.arch == pybuda.BackendDevice.Blackhole: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.enable_auto_fusing = False + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.92 + # Sample Image pixel_values = get_sample_data(variant) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py index 6f891074..fa7ffdba 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py @@ -138,6 +138,36 @@ def test_pidnet_pytorch(variant, test_device): (1, 16), ) + if test_device.arch == pybuda.BackendDevice.Blackhole: + if variant == "pidnet_s": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "217088" + compiler_cfg.balancer_op_override( + "conv2d_214.dc.reshape.12.dc.sparse_matmul.1.lc2", "t_stream_shape", (1, 4) + ) + compiler_cfg.amp_level = 1 + compiler_cfg.balancer_op_override( + "conv2d_377.dc.conv2d.5.dc.reshape.0_operand_commute_clone132.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_377.dc.conv2d.5.dc.reshape.0_operand_commute_clone107_operand_commute_clone134.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_1010.dc.reshape.0_operand_commute_clone513_operand_commute_clone607.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.place_on_new_epoch("conv2d_960.dc.reshape.12.dc.sparse_matmul.10.lc2") + compiler_cfg.balancer_op_override( + "conv2d_1010.dc.reshape.0_operand_commute_clone605.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.place_on_new_epoch("resize2d_353.dc.reshape.5.dc.sparse_matmul.10.lc2") + # Load model cfg_model_pretrained, cfg_model_state_file = 
update_model_config(variant) model = get_seg_model(variant, cfg_model_pretrained, imgnet_pretrained=True) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py index 66318a1a..c449698a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls_1.py @@ -64,6 +64,10 @@ def test_segformer_imgcls_pytorch_1(test_device, variant): ): pcc_value = 0.97 + if test_device.arch == pybuda.BackendDevice.Blackhole: + if variant == "nvidia/mit-b0" and test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.97 + # Set model configurations config = SegformerConfig.from_pretrained(variant) config_dict = config.to_dict() diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py index d5c2fa26..a2b44a28 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py @@ -74,6 +74,10 @@ def test_pytorch_ssd300_resnet50(test_device): compiler_cfg.place_on_new_epoch("conv2d_766.dc.matmul.11") os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "45056" + if test_device.arch == BackendDevice.Blackhole: + compiler_cfg.place_on_new_epoch("conv2d_766.dc.matmul.11") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "54000" + # STEP 2 : prepare model model = torch.hub.load( "NVIDIA/DeepLearningExamples:torchhub", "nvidia_ssd", pretrained=False diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py index 63a0a6ed..2a2e0da2 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py @@ -53,6 +53,10 @@ def test_vit_classify_224_hf_pytorch(variant, test_device): test_device, variant, ) + if test_device.arch == BackendDevice.Blackhole: + if variant == "google/vit-large-patch16-224": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{20*1024}" + if "PYBUDA_NEB_GALAXY_CI" in os.environ: chip_ids = [0, 11, 10, 9, 8, 7, 19, 20, 21, 22, 23, 24, 6, 5, 14, 13, 12, 16, 15, 3, 4, 26, 25, 32, 31, 30, 29, 28, 27, 1, 2, 18, 17] else: @@ -79,7 +83,12 @@ def test_vit_classify_224_hf_pytorch_1x1(variant, test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + if "large" in variant: os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "20000" @@ -116,11 +125,12 @@ def test_vit_classification_1x1_demo(test_device, mode, variant): pytest.skip("Not supported") # Setup for 1x1 grid - if test_device.arch == BackendDevice.Wormhole_B0: + elif test_device.arch == BackendDevice.Wormhole_B0: os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Blackhole: os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" - + # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py 
b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py index d54c4102..d02a27b6 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py @@ -117,7 +117,12 @@ def test_yolov3_holli_pytorch_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" model, inputs, other = generate_model_yoloV3_imgcls_holli_pytorch( test_device, None, diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py index df5c257a..4d63b95b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py @@ -51,6 +51,9 @@ def generate_model_yoloV5I320_imgcls_torchhub_pytorch(test_device, variant, size if size == "l" or size == "m" or size == "x": compiler_cfg.enable_auto_fusing = False + elif test_device.arch == BackendDevice.Blackhole: + compiler_cfg.default_df_override = DataFormat.Float16_b + name = "yolov5" + size model = download_model(torch.hub.load, variant, name, pretrained=True) @@ -178,6 +181,18 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{98*1024}" + elif test_device.arch == BackendDevice.Blackhole: + compiler_cfg.default_df_override = DataFormat.Float16_b + + if size == "s": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{14*1024}" + elif size == "l": + compiler_cfg.enable_auto_transposing_placement = True + compiler_cfg.enable_tm_cpu_fallback = True + compiler_cfg.balancer_op_override("conv2d_328.dc.matmul.8", "grid_shape", (5,2)) + os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "0" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{92*1024}" + name = "yolov5" + size model = download_model(torch.hub.load, variant, name, pretrained=True) module = PyTorchModule("pt_" + name + "_640x640", model) @@ -278,14 +293,26 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size "size", size, ids=["yolov5" + s for s in size] ) def test_yolov5_480x480(test_device, size): + compiler_cfg = _get_global_compiler_config() + if test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" - if size in ["m", "l"] and test_device.arch == BackendDevice.Wormhole_B0: - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" - if size in ["s"] and test_device.arch == BackendDevice.Wormhole_B0: - os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" - if size in ["x"] and test_device.arch == BackendDevice.Wormhole_B0: - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{68*1024}" + + elif test_device.arch == BackendDevice.Wormhole_B0: + if size in ["m", "l"]: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + + elif size in ["s"]: + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + + elif size in ["x"]: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{68*1024}" + + elif test_device.arch == 
BackendDevice.Blackhole: + compiler_cfg.default_df_override = DataFormat.Float16_b + + if size in ["x"]: + compiler_cfg.place_on_new_epoch("conv2d_44.dc.matmul.11") model, inputs, _ = generate_model_yoloV5I480_imgcls_torchhub_pytorch( test_device, "ultralytics/yolov5", diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py index a59ff975..5352fc15 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py @@ -122,10 +122,10 @@ def test_yolo_v6_pytorch(variant, test_device): (2, 1), ) - if test_device.arch == BackendDevice.Wormhole_B0 and variant == "yolov6l": + if test_device.arch in [BackendDevice.Wormhole_B0, BackendDevice.Blackhole] and variant == "yolov6l": os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - if test_device.arch == BackendDevice.Grayskull and variant == "yolov6l": + if test_device.arch in [BackendDevice.Grayskull] and variant == "yolov6l": compiler_cfg.balancer_op_override( "conv2d_484.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (1, 1) ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py index d29249f9..c9572d2d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_x.py @@ -183,6 +183,74 @@ def test_yolox_pytorch(variant, test_device): compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) compiler_cfg.place_on_new_epoch("concatenate_2264.dc.sparse_matmul.11.lc2") + elif test_device.arch == BackendDevice.Blackhole: + if variant in ["yolox_nano", "yolox_tiny"]: + compiler_cfg.place_on_new_epoch("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.14.lc2") + compiler_cfg.place_on_new_epoch("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.14.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "81920" + + elif variant == "yolox_s": + compiler_cfg.balancer_op_override("concatenate_1163.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "grid_shape", (1, 5)) + compiler_cfg.place_on_new_epoch("concatenate_1163.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + elif variant == "yolox_m": + compiler_cfg.place_on_new_epoch("conv2d_811.dc.matmul.8") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 6)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", "t_stream_shape", (5, 1)) + 
compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_1530.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + elif variant in ["yolox_l", "yolox_darknet", "yolox_x"]: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + if variant == "yolox_l": + compiler_cfg.place_on_new_epoch("conv2d_1410.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1644.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_1897.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_darknet": + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.3.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.5.dc.matmul.11") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "12288" + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.5.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2") + compiler_cfg.place_on_new_epoch("conv2d_1070.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1147.dc.matmul.11") + compiler_cfg.balancer_op_override("concatenate_1242.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + + elif variant == "yolox_x": + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.5.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1717.dc.conv2d.3.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1699.dc.matmul.8") + compiler_cfg.place_on_new_epoch("conv2d_1732.dc.matmul.8") + compiler_cfg.place_on_new_epoch("conv2d_1981.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_1736.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_2264.dc.sparse_matmul.11.lc2") + # prepare model weight_name = f"{variant}.pth" url = 
f"https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/{weight_name}" diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py index 48e51296..33e2028d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py @@ -27,7 +27,12 @@ def test_efficientnet_lite0_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" compiler_cfg = _get_global_compiler_config() @@ -57,7 +62,12 @@ def test_efficientnet_lite4_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py index a6d5c577..2bc244c8 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py @@ -27,7 +27,12 @@ def test_hand_landmark_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_ENABLE_SINGLE_BUFFER_FALLBACK"] = "1" @@ -62,7 +67,12 @@ def test_palm_detection_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py index f6cbbe39..9ccfce19 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py @@ -27,7 +27,12 @@ def test_mobilenet_ssd_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + 
os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py index 98ec9790..4a760f96 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py @@ -27,7 +27,12 @@ def test_pose_landmark_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_SPLIT_RESIZE2D"] = "128" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_MAX_CONCAT_INPUTS"] = "6" @@ -63,7 +68,12 @@ def test_pose_landmark_heavy_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + os.environ["PYBUDA_SPLIT_RESIZE2D"] = "128" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_MAX_CONCAT_INPUTS"] = "6" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py index f4c4df5e..3a1a7fc8 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py @@ -101,6 +101,9 @@ def test_bert_question_answering_pytorch(test_device): test_device, "bert-large-cased-whole-word-masking-finetuned-squad", ) + if test_device.arch == BackendDevice.Blackhole: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{42*1024}" + verify_module( model, input_shapes=[(inputs[0].shape,)], diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py index a5f9ad10..d2c103bf 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py @@ -47,6 +47,12 @@ def test_codegen(test_device, variant): elif variant == "Salesforce/codegen-350M-nl": compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16 pcc_value = 0.90 + elif test_device.arch == BackendDevice.Blackhole: + if test_device.devtype == BackendType.Silicon: + if variant == "Salesforce/codegen-350M-multi": + pcc_value = 0.96 + elif variant == "Salesforce/codegen-350M-nl": + pcc_value = 0.95 # Load model (with tokenizer) tokenizer = download_model(AutoTokenizer.from_pretrained, variant) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py index d77df50d..bdb44625 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py @@ 
-364,7 +364,12 @@ def test_gemma_2b_1x1(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() - os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + elif test_device.arch == BackendDevice.Blackhole: + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "blackhole_1x1.yaml" + compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py index d7779da2..3ce2963e 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py @@ -46,6 +46,10 @@ def test_gptneo_causal_lm(variant, test_device): if variant == "EleutherAI/gpt-neo-1.3B": os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "76444" + elif test_device.arch == BackendDevice.Blackhole: + if variant == "EleutherAI/gpt-neo-125M": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{12*1024}" + # Load tokenizer and model # Variants: # EleutherAI/gpt-neo-125M, EleutherAI/gpt-neo-1.3B, # EleutherAI/gpt-neo-2.7B diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py index bd51bf91..e64a0dc7 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py @@ -2,6 +2,7 @@ from pybuda.verify.backend import verify_module from pybuda import VerifyConfig from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendType, BackendDevice import torch from transformers import PhiForCausalLM, AutoTokenizer, PhiConfig import os @@ -25,6 +26,9 @@ def test_phi2_clm(test_device, variant): compiler_cfg.amp_level = 1 os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" + if test_device.arch == BackendDevice.Blackhole: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{32*1024}" + # Load PhiConfig from pretrained variant, disable return_dict and caching. 
config = PhiConfig.from_pretrained(variant) config_dict = config.to_dict() diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py index f9eadb41..cb4ada84 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py @@ -81,12 +81,14 @@ def test_t5_generation(variant, test_device): # import os # os.environ["PYBUDA_ENABLE_TINY_TILE"] = "1" # Add PyBUDA configurations - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_tvm_cpu_fallback = False - compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 - compiler_cfg.amp_level = 1 - compiler_cfg.enable_enumerate_u_kt = False - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch != BackendDevice.Blackhole or variant not in ["t5-base", "google/flan-t5-base"]: + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.enable_tvm_cpu_fallback = False + compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 + compiler_cfg.amp_level = 1 + compiler_cfg.enable_enumerate_u_kt = False + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if "large" in variant: os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py index 171db204..dc2a7dbe 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py @@ -129,6 +129,37 @@ def test_whisper_enc_dec(test_device, variant): # compiler_cfg.enable_enumerate_u_kt = False os.environ["PYBUDA_TEMP_RIBBON2_LEGACY_UTIL_EVAL"] = "1" + if test_device.arch == BackendDevice.Blackhole: + compiler_cfg.amp_level = 1 + os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" + os.environ["PYBUDA_PAD_OUTPUT_BUFFER_THRESHOLD_TILES"] = "1536" + + os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" + os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" + os.environ["PYBUDA_NOP_ON_DIRECT_SHORT_PATH"] = "1" + os.environ["PYBUDA_SKIP_SMALL_UKT"] = "1" + + + if variant == "openai/whisper-base": + os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "None" + compiler_cfg.enable_auto_fusing = False + + if variant == "openai/whisper-small": + os.environ["PYBUDA_DISABLE_SELF_CUT_FOR_SUBGRAPHS"] = "1, 2" + compiler_cfg.enable_auto_fusing = False + + if variant == "openai/whisper-medium": + os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "None" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.balancer_op_override("layernorm_66.dc.add.14", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("layernorm_1193.dc.add.14", "t_stream_shape", (1,1)) + + if variant == "openai/whisper-large": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "0" + os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.place_on_new_epoch("matmul_2818") + run_encoder_on_tt = ("tiny" in variant) or ("base" in variant) or ("small" in variant) or ("medium" in variant) pad_model = True diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py index 816741b0..19f4c186 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py @@ -30,6 +30,9 @@ def test_xglm_causal_lm(variant, 
test_device): if (test_device.arch == BackendDevice.Grayskull and variant == "facebook/xglm-564M") or (test_device.arch == BackendDevice.Wormhole_B0): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + if test_device.arch == BackendDevice.Blackhole: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{41*1024}" + # Load tokenizer and model from HuggingFace # Variants: "facebook/xglm-564M", "facebook/xglm-1.7B" @@ -54,7 +57,7 @@ def test_xglm_causal_lm(variant, test_device): ) pcc_value = 0.99 - if test_device.arch == BackendDevice.Wormhole_B0 and test_device.devtype == BackendType.Silicon: + if test_device.arch in [BackendDevice.Wormhole_B0, BackendDevice.Blackhole] and test_device.devtype == BackendType.Silicon: if variant == "facebook/xglm-564M": pcc_value = 0.91 elif variant == "facebook/xglm-1.7B": From 37355f836b68da657d30a6b40b6b955251004c97 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Wed, 7 Aug 2024 12:50:12 +0000 Subject: [PATCH 091/116] Failing reasons Categorization of bugs via common messages for test xfail and skip failing reasons. Issue #2829 (cherry picked from commit 4a1bb1b0f9cdab7258e9fa9cbd0b855854f03b4c) --- .../eltwise_binary/test_eltwise_binary.py | 57 ++++++++++-------- .../eltwise_unary/test_eltwise_unary.py | 60 ++++++++++--------- pybuda/test/operators/matmul/test_matmul.py | 24 +++++--- .../operators/matmul/test_matmul_pytorch.py | 8 ++- .../operators/matmul/test_sparse_matmul.py | 27 +++++---- .../test/operators/nary/test_concatenate.py | 28 +++++---- pybuda/test/operators/nary/test_index_copy.py | 8 ++- pybuda/test/operators/nary/test_stack.py | 10 +++- pybuda/test/operators/nary/test_where.py | 19 ++++-- pybuda/test/operators/utils/__init__.py | 2 + .../test/operators/utils/failing_reasons.py | 50 ++++++++++++++++ 11 files changed, 200 insertions(+), 93 deletions(-) create mode 100644 pybuda/test/operators/utils/failing_reasons.py diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index 7f80533c..a29aa222 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -72,6 +72,7 @@ from pybuda.op_repo import TensorShape from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils +from test.operators.utils import FailingReasons from test.conftest import TestDevice from pybuda import TTDevice, pybuda_compile, VerifyConfig, CompilerConfig @@ -313,30 +314,31 @@ def get_input_shapes(): # 2-dimensional shape, microbatch_size > 1: # All shapes fails for all operators + # Skip shapes where microbatchsize > 1 pytest.param((3, 4), #13 # 3.1 Full tensor (i.e. full expected shape) - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((45, 17), #14 # 3.1 Full tensor (i.e. 
full expected shape) - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((64, 1), #15 # 3.2 Tensor reduce on one or more dims to 1 - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((100, 100), #16 # 4.3 Very large (thousands, 10s of thousands) - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((1000, 100), #17 # 4.3 Very large (thousands, 10s of thousands) - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((10, 1000), #18 # 4.4 Extreme ratios between height/width - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((9920, 1), #19 # 4.4 Extreme ratios between height/width - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((10000, 1), #20 # 4.4 Extreme ratios between height/width - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((32, 64), #21 # 4.1 Divisible by 32 - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((160, 96), #22 # 4.1 Divisible by 32 - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((17, 41), #23 # 4.2 Prime numbers - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((89, 3), #24 # 4.2 Prime numbers - marks=pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), # 3-dimensional shape, microbatch_size = 1: (1, 3, 4), #25 # 3.1 Full tensor (i.e. full expected shape) @@ -392,8 +394,9 @@ def get_input_shapes(): (9, 1, 9920, 1), #69 # 4.4 Extreme ratios between height/width (10, 10, 10000, 1), #70 # 4.4 Extreme ratios between height/width (11, 32, 32, 64), #71 # 4.1 Divisible by 32 + # RuntimeError: Fatal Python error: Segmentation fault pytest.param((12, 64, 160, 96), #72 # 4.1 Divisible by 32 - marks=pytest.mark.skip(reason="RuntimeError: Fatal Python error: Segmentation fault")), + marks=pytest.mark.skip(reason=FailingReasons.SEG_FAULT)), (13, 11, 17, 41), #73 # 4.2 Prime numbers (14, 13, 89, 3), #74 # 4.2 Prime numbers ] @@ -418,22 +421,24 @@ def test_eltwise_binary_ops_per_test_plan( # Error Message: "RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/policies/policy_utils.cpp:2221: " + # "graph ->get_edges( graph->get_node_by_name(nopInsertInst->src), " + # "graph->get_node_by_name(nopInsertInst->dest)) .size() == 1" - pytest.xfail(reason="Buggy shapes for ModelOpSrcFromTmEdge1.") + pytest.xfail(reason=FailingReasons.BUGGY_SHAPE) # 2. 
input_shape in ((1, 9920, 1), (1, 1, 9920, 1), (9, 1, 9920, 1)): if model_type == ModelFromAnotherOp and input_operator in ["Equal", "NotEqual"] and input_shape in (s[32], s[56], s[69]): # Error Mesage: "RuntimeError: Fatal balancer error: Could not reconcile constraints: path[Add0 -> _fused_op_0]" - pytest.xfail(reason="Buggy shapes for ModelFromAnotherOp.") + pytest.xfail(reason=FailingReasons.BUGGY_SHAPE) # 3. BinaryStack bugs: if input_operator == "BinaryStack": if len(input_shape) in (2, 3): # input_shapes are 2-dimensional and 3-dimensional: - pytest.xfail(reason="BinaryStack operator is not working for 2D and 3D shapes.") + # BinaryStack operator is not working for 2D and 3D shapes + pytest.xfail(reason=FailingReasons.UNSUPPORTED_DIMENSION) elif model_type == ModelConstEvalPass: # model_type is ModelConstEvalPass: - pytest.xfail(reason="BinaryStack operator is not working for ModelConstEvalPass.") + pytest.xfail(reason=FailingReasons.UNSUPPORTED_INPUT_SOURCE) elif input_shape in (s[55], s[56], s[57], s[68], s[69], s[70]): # input_shapes are all with extreme ratios between height/width: - pytest.xfail(reason="BinaryStack operator is not working for shapes that have extreme ratios between height/width") + # BinaryStack operator is not working for shapes that have extreme ratios between height/width + pytest.xfail(reason=FailingReasons.BUGGY_SHAPE) # ------------------------------------------------------------------------------------------------------------------------------------ @@ -477,8 +482,9 @@ def get_eltwise_binary_ops_prologued(): pytest.param("Add"), #00 pytest.param("Max"), #01 pytest.param("Min"), #02 + # Validation error caused by pcc threshold pytest.param("Power", #03 - marks=pytest.mark.xfail(reason="Validation error caused by pcc threshold.")), + marks=pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)), pytest.param("Subtract"), #04 pytest.param("Multiply"), #05 pytest.param("Heaviside"), #06 @@ -500,10 +506,11 @@ def get_input_shapes_prologued(): ((1, 17), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #01 # 3.1 Full tensor (i.e. full expected shape) # 2-dimensional shape, microbatch_size > 1: + # "Doesn't work for microbatchsize > 1 and two dimensions." pytest.param((4, 16), InputSourceFlags.FROM_DRAM_PROLOGUED, True, #02 # 3.1 Full tensor (i.e. full expected shape) - marks=pytest.mark.xfail(reason="Doesn't work for microbatchsize > 1 and two dimensions.")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), pytest.param((3, 17), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False, #03 # 3.1 Full tensor (i.e. full expected shape) - marks=pytest.mark.xfail(reason="Doesn't work for microbatchsize > 1 and two dimensions.")), + marks=pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED)), # 3-dimensional shape: ((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #04 # 3.1 Full tensor (i.e. full expected shape) @@ -551,7 +558,8 @@ def test_eltwise_binary_ops_per_test_plan_dram_prologued( # 1. BinaryStack bugs: if input_operator == "BinaryStack" and len(input_shape) in (2, 3): # input_shapes are 2-dimensional and 3-dimensional: - pytest.xfail(reason="BinaryStack operator is not working for 2D and 3D shapes.") + # BinaryStack operator is not working for 2D and 3D shapes. 
+ pytest.xfail(reason=FailingReasons.UNSUPPORTED_DIMENSION) # ----------------------------------------------------------------------------------------------------------------------------------- # Divide behaves differently from another operators for this shape @@ -675,7 +683,7 @@ def test_df_eltwise_binary_ops_per_test_plan(input_operator, model_type, test_de # Error Message: "Compile error: 'logical_and'" # ... # Error Message: "KeyError: 'logical_and'" -@pytest.mark.xfail(reason="Not implemented") +@pytest.mark.xfail(reason=FailingReasons.NOT_IMPLEMENTED) def test_eltwise_binary_logicaland_operator(test_device): verify( @@ -690,7 +698,8 @@ def test_eltwise_binary_logicaland_operator(test_device): # It is not clear what the operator should do, because the documentation is missing - it is copied from Max operator. # Case with dim=-1 is covered with other operators in test "test_eltwise_binary_ops_per_test_plan". # This test covers all other values for dim parameter. -@pytest.mark.xfail(reason="Operator is not working for dim parameter different than -1.") +# Operator is not working for dim parameter different than -1. +@pytest.mark.xfail(reason=FailingReasons.UNSUPORTED_AXIS) @pytest.mark.parametrize("shape", [(1, 3, 3, 3)]) @pytest.mark.parametrize("dim", [-2, 0, 1, 2]) @pytest.mark.parametrize("model", [ModelFromHost, ModelFromAnotherOp]) diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py index 5394bd15..111c43ea 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py @@ -67,6 +67,7 @@ from pybuda.verify.config import TestKind from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils +from test.operators.utils import FailingReasons from test.conftest import TestDevice from pybuda.module import PyBudaModule @@ -255,7 +256,7 @@ def get_input_shapes(): (10, 10, 10000, 1), #73 # 4.4 Extreme ratios between height/width (11, 32, 32, 64), #74 # 4.1 Divisible by 32 #Fatal Python error: Segmentation fault - pytest.param((12, 64, 160, 96), marks=pytest.mark.skip(reason="Inference fail due to seg fault")), #75 # 4.1 Divisible by 32 + pytest.param((12, 64, 160, 96), marks=pytest.mark.skip(reason=FailingReasons.SEG_FAULT)), #75 # 4.1 Divisible by 32 (13, 11, 17, 41), #76 # 4.2 Prime numbers (14, 13, 89, 3), #77 # 4.2 Prime numbers ] @@ -271,93 +272,93 @@ def xfail_test(input_operator, input_shape, input_model, input_kwargs): case "Argmax": if(len(input_shape) == 2 and micro_batch_size > 1 and input_model in ("model_op_src_from_another_op", "model_op_src_from_tm_edge2")): # E AssertionError: Error during inference - pytest.xfail("Inference failed") + pytest.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED) elif(input_shape in ((s[16],) + (s[20],) + (s[21],)) and input_model == "model_op_src_from_tm_edge1"): # E AssertionError: Error during inference - pytest.xfail("Inference failed") + pytest.xfail(reason=FailingReasons.BUGGY_SHAPE) elif(input_shape in ((s[31],) + (s[33],) + (s[36],) + (s[44],) + (s[46],) + (s[49],)+ (s[56],) + (s[57],) + (s[59],) + tuple(s[60:63]) + (s[69],) + (s[70],) + tuple(s[72:75]))): # E RuntimeError: 1/2/3 Nodes have no valid grids, exiting - pytest.xfail("RuntimeError") + pytest.xfail(reason=FailingReasons.BUGGY_SHAPE) case "Dropout": # Error message: E AssertionError: Data mismatch detected - pytest.xfail("Data mismatch") + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) case "LogicalNot": # 
Error message: E KeyError: 'logical_not' - pytest.xfail("Not implemented operator") + pytest.xfail(reason=FailingReasons.NOT_IMPLEMENTED) case "Tilize": # Error message: E AttributeError: module 'torch' has no attribute 'tensors' - pytest.xfail("Inference failed") + pytest.xfail(reason=FailingReasons.INFERENCE_FAILED) case "CumSum": if input_model in ("model_op_src_from_dram", "model_op_src_from_host", "model_op_src_from_another_op"): # E RuntimeError: Input operand not mapped to new graph during lowering: CumSum1 - pytest.xfail("RuntimeError") + pytest.xfail(reason=FailingReasons.COMPILATION_FAILED) elif input_model in ("model_op_src_const_inputs1", "model_op_src_from_tm_edge1", "model_op_src_from_tm_edge2"): # E RuntimeError: TT_ASSERT @ pybuda/csrc/passes/lowering_context.cpp:28: old_node->node_type() != graphlib::NodeType::kPyOp - pytest.xfail("RuntimeError") + pytest.xfail(reason=FailingReasons.COMPILATION_FAILED) case "Pow": if(micro_batch_size > 1): if(input_kwargs['exponent'] not in (1000, 10000) and len(input_shape) == 2): # E AssertionError: Error during inference - pytest.xfail("Inference failed") + pytest.xfail(reason=FailingReasons.INFERENCE_FAILED) elif(input_kwargs['exponent'] == 1000): if(input_shape in (tuple(s[13:26]))): # E AssertionError: Error during inference - pytest.xfail('Inference failed') + pytest.xfail(reason=FailingReasons.INFERENCE_FAILED) elif(input_model in ("model_op_src_from_host", "model_op_src_from_tm_edge1", "model_op_src_from_dram") and input_shape in ((s[39],) + (s[41],) + (s[66],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif(input_model in ("model_op_src_const_inputs1") and input_shape in (s[39],)): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif(input_kwargs['exponent'] == 10000): if(input_shape in (tuple(s[13:26]))): # E AssertionError: Error during inference - pytest.xfail('Inference failed') + pytest.xfail(reason=FailingReasons.INFERENCE_FAILED) elif(input_model in ("model_op_src_from_host", "model_op_src_from_tm_edge1", "model_op_src_from_dram") and input_shape in (tuple(s[39:52]) + tuple(s[65:69]) + tuple(s[71:75]) + tuple(s[76:78]))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif(input_model in ("model_op_src_const_inputs1") and input_shape in ((s[39],) + (s[41],) + (s[66],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) else: match input_model: case "model_op_src_from_host": if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],)) ): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) case "model_op_src_from_dram": if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) 
elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) case "model_op_src_const_inputs1": if (input_kwargs['exponent'] == 160 and input_shape in (s[3],)): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:2]) + (s[3],) + (s[12],) + (s[26],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:4]) + tuple(s[11:13]) + (s[26],) + (s[28],) + (s[52],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) case "model_op_src_from_tm_edge1": if (input_kwargs['exponent'] == 1000 and input_shape in (tuple(s[0:5]) + tuple(s[9:13]) + (s[26],) + (s[28],) + (s[29],) + (s[38],) + (s[52],) + (s[54],))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) elif (input_kwargs['exponent'] == 10000 and input_shape in (tuple(s[0:13]) + tuple(s[26:39]) + tuple(s[52:65]))): # E AssertionError: Data mismatch detected - pytest.xfail('Data missmatch') + pytest.xfail(reason=FailingReasons.DATA_MISMATCH) case "model_op_src_from_another_op", "model_op_src_from_tm_edge2": return case _: if(len(input_shape) == 2 and micro_batch_size > 1): # E AssertionError: Error during inference - pytest.xfail('Inference failed') + pytest.xfail(reason=FailingReasons.INFERENCE_FAILED) @@ -400,7 +401,7 @@ def get_pow_kwargs(): return [ # Error message: E RuntimeError: TT_ASSERT @ pybuda/csrc/graph_lib/shape.cpp:34: values.size() >= BUDA_DIM_COUNT and values.size() <= BUDA_MAX_DIM_COUNT # 18 are always xpassed - pytest.param(0.9336911808323198, marks=pytest.mark.xfail(reason="RuntimeError")), + pytest.param(0.9336911808323198, marks=pytest.mark.xfail(reason=FailingReasons.COMPILATION_FAILED)), 0, 1, 2, @@ -451,7 +452,8 @@ def get_clip_kwargs(): (0.4992656851851959, None), (None, 0.9336911808323198), # Error message: E RuntimeError: yaml-cpp: error at line 22, column 70: bad conversion - pytest.param(None, None, marks=pytest.mark.xfail(reason="RuntimeError")), + # Error message: E RuntimeError: Unexpected index + pytest.param(None, None, marks=pytest.mark.xfail(reason=FailingReasons.COMPILATION_FAILED)), ] @pytest.mark.parametrize("input_shape", get_input_shapes()) @pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) @@ -490,7 +492,7 @@ def get_cum_sum_kwargs_exclusive(): return [ False, # Error message:E Assertion error: Currently not supported - pytest.param(True, marks=pytest.mark.xfail(reason="Unsupported parameter value")) + pytest.param(True, marks=pytest.mark.xfail(reason=FailingReasons.UNSUPPORTED_PARAMETER_VALUE)), ] @pytest.mark.parametrize("input_shape", get_input_shapes()) @pytest.mark.parametrize("input_model", [item.split(".")[0] for item in os.listdir(TEST_PLAN_MODELS_PATH) if "model" in item]) diff --git a/pybuda/test/operators/matmul/test_matmul.py b/pybuda/test/operators/matmul/test_matmul.py index 7f4c28b3..29af7659 100644 --- a/pybuda/test/operators/matmul/test_matmul.py +++ b/pybuda/test/operators/matmul/test_matmul.py @@ 
-85,6 +85,7 @@ from pybuda.op.eval.common import compare_tensor_to_golden from test.operators.utils import netlist_utils +from test.operators.utils import FailingReasons from .models import generic from .models import custom @@ -199,21 +200,28 @@ def test_matmul_according_to_pytorch_docs( test_device ): + # TODO Unify models 11 to 15 by parametrizing the input shapes + #BUG if model in ("model_11", ): - pytest.xfail("Matmul op when two input tensors are vectors is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:135: E IndexError: list index out of range") + # Matmul op when two input tensors are vectors is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:135: E IndexError: list index out of range + pytest.xfail(reason=FailingReasons.UNSUPPORTED_SPECIAL_CASE) #BUG if model in ("model_12", ): - pytest.xfail("Matmul op when two input tensors are matrix(without microbatch size) is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3 and 1x7)") + # Matmul op when two input tensors are matrix(without microbatch size) is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3 and 1x7) + pytest.xfail(reason=FailingReasons.UNSUPPORTED_SPECIAL_CASE) #BUG if model in ("model_13", ): - pytest.xfail("Matmul op if the first argument is 1-dimensional and the second argument is 2-dimensional is not supported. Error: pybuda/pybuda/tensor.py:383: E AssertionError: Setting a tensor value of incorrect shape: (1, 7) vs torch.Size([7])") + # Matmul op if the first argument is 1-dimensional and the second argument is 2-dimensional is not supported. Error: pybuda/pybuda/tensor.py:383: E AssertionError: Setting a tensor value of incorrect shape: (1, 7) vs torch.Size([7]) + pytest.xfail(reason=FailingReasons.UNSUPPORTED_SPECIAL_CASE) #BUG if model in ("model_14", ): - pytest.xfail("Matmul op if the first argument is 2-dimensional and the second argument is 1-dimensional is not suppported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: size mismatch, got input (1), mat (1x64), vec (1)") + # Matmul op if the first argument is 2-dimensional and the second argument is 1-dimensional is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: size mismatch, got input (1), mat (1x64), vec (1) + pytest.xfail(reason=FailingReasons.UNSUPPORTED_SPECIAL_CASE) #BUG if model in ("model_15", ): - pytest.xfail("Matmul op when one of the arguments is 1-dimensional and the other one is N-dimensional is not suppported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: size mismatch, got input (32), mat (32x64), vec (1)") + # Matmul op when one of the arguments is 1-dimensional and the other one is N-dimensional is not supported. Error: pybuda/pybuda/op/eval/pybuda/matmul.py:29: E RuntimeError: size mismatch, got input (32), mat (32x64), vec (1) + pytest.xfail(reason=FailingReasons.UNSUPPORTED_SPECIAL_CASE) architecture = f'special_cases.{model}.BudaMatmulTest()' model = eval(architecture) @@ -287,7 +295,8 @@ def test_matmul_according_to_test_plan( "model_op_src_const_inputs2", "model_op_src_from_host", ): - pytest.xfail("Error for input shape (1, 1, 10000, 1). Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245:") + # Error for input shape (1, 1, 10000, 1). 
Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245: + pytest.xfail(reason=FailingReasons.COMPILATION_FAILED) # generate input shapes for every model opernad_num = 0 @@ -431,7 +440,8 @@ def test_matmul_dram_prologued( model = "model_op_src_const_inputs2" #BUG: when input shape is (2, 1, 10000, 1) or (2, 10000, 1) - extreme ratios between height/width; it works for input shape when one dimension is 9920 or less, everything above(like 10000) throws error if (input_shape == (2, 1, 10000, 1) or input_shape == (2, 10000, 1)) and model == "model_op_src_const_inputs2": - pytest.xfail("Error for input shape (1, 1, 10000, 1). Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245:") + # Error for input shape (1, 1, 10000, 1). Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245: + pytest.xfail(reason=FailingReasons.COMPILATION_FAILED) # generate input shapes opernad_num = 0 diff --git a/pybuda/test/operators/matmul/test_matmul_pytorch.py b/pybuda/test/operators/matmul/test_matmul_pytorch.py index e8a030eb..1a7f88b2 100644 --- a/pybuda/test/operators/matmul/test_matmul_pytorch.py +++ b/pybuda/test/operators/matmul/test_matmul_pytorch.py @@ -6,6 +6,8 @@ from pybuda.verify import verify_module, VerifyConfig +from test.operators.utils import FailingReasons + class Matmul2ModelPyBuda(pybuda.PyBudaModule): '''PyBuda model with two matmul operations''' @@ -83,21 +85,23 @@ def forward(self, in_value1: torch.Tensor, in_value2: torch.Tensor, in_value3: t # if fails via PyTorch when operand source for matmul is another matmul and inputs are 3-dimensional tensors + # 3-dimensional tensors are not working via pytorch if operand source is another matmul ? pytest.param(Matmul2ModelPyTorch, [ (1, 5, 3), (1, 3, 2), (1, 2, 4), - ], marks=pytest.mark.xfail(reason="3-dimensional tensors are not working via pytorch if operand source is another matmul ?")), + ], marks=pytest.mark.xfail(reason=FailingReasons.UNSUPPORTED_DIMENSION)), # Errors: # - Failed on "ExplicateTranspose" TVM callback # - ValueError: The type checker has not populated the checked_type for this node + # 3-dimensional tensors are not working via pytorch if operand source is another matmul ? # size of shpae dimensions is not important pytest.param(Matmul2ModelPyTorch, [ (1, 64, 32), (1, 32, 64), (1, 64, 128), - ], marks=pytest.mark.xfail(reason="3-dimensional tensors are not working via pytorch if operand source is another matmul ?")), + ], marks=pytest.mark.xfail(reason=FailingReasons.UNSUPPORTED_DIMENSION)), # Errors: # - Failed on "ExplicateTranspose" TVM callback # - ValueError: The type checker has not populated the checked_type for this node diff --git a/pybuda/test/operators/matmul/test_sparse_matmul.py b/pybuda/test/operators/matmul/test_sparse_matmul.py index 00dbfa8c..e5779464 100644 --- a/pybuda/test/operators/matmul/test_sparse_matmul.py +++ b/pybuda/test/operators/matmul/test_sparse_matmul.py @@ -70,6 +70,7 @@ import torch from test.operators.utils import netlist_utils +from test.operators.utils import FailingReasons @@ -347,18 +348,21 @@ def forward(self, x): pytest.param((1, 64, 1, 23), (23, 1)), #3 # 3.2 Tensor reduce on one or more dims to 1 pytest.param((1, 64, 100, 100), (100, 100)), #5 # 4.3 Very large (thousands, 10s of thousands) - pytest.param((1, 64, 45, 17), (17, 45), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #2 # 3.1 Full tensor (i.e. 
full expected shape) - pytest.param((1, 64, 64, 1), (1, 64), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #4 # 3.2 Tensor reduce on one or more dims to 1 - pytest.param((1, 64, 1000, 100), (100, 1000), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #6 # 4.3 Very large (thousands, 10s of thousands) - pytest.param((1, 64, 160, 96), (96, 160), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #11 # 4.1 Divisible by 32 - pytest.param((1, 64, 89, 3), (3, 89), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Error during inference")), #13 # 4.2 Prime numbers + # Error message: E AssertionError: Error during inference + pytest.param((1, 64, 45, 17), (17, 45), marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), #2 # 3.1 Full tensor (i.e. full expected shape) + pytest.param((1, 64, 64, 1), (1, 64), marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), #4 # 3.2 Tensor reduce on one or more dims to 1 + pytest.param((1, 64, 1000, 100), (100, 1000), marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), #6 # 4.3 Very large (thousands, 10s of thousands) + pytest.param((1, 64, 160, 96), (96, 160), marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), #11 # 4.1 Divisible by 32 + pytest.param((1, 64, 89, 3), (3, 89), marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), #13 # 4.2 Prime numbers - pytest.param((1, 64, 10, 1000), (1000, 10), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #7 # 4.4 Extreme ratios between height/width - pytest.param((1, 64, 32, 64), (64, 32), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #10 # 4.1 Divisible by 32 - pytest.param((1, 64, 17, 41), (41, 17), marks=pytest.mark.xfail(reason="Error message: E AssertionError: Data mismatch detected")), #12 # 4.2 Prime numbers + # "Error message: E AssertionError: Data mismatch detected" + pytest.param((1, 64, 10, 1000), (1000, 10), marks=pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)), #7 # 4.4 Extreme ratios between height/width + pytest.param((1, 64, 32, 64), (64, 32), marks=pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)), #10 # 4.1 Divisible by 32 + pytest.param((1, 64, 17, 41), (41, 17), marks=pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)), #12 # 4.2 Prime numbers - pytest.param((1, 64, 9920, 1), (1, 9920), marks=pytest.mark.skip(reason="Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), #8 # 4.4 Extreme ratios between height/width - pytest.param((1, 64, 10000, 1), (1, 10000), marks=pytest.mark.skip(reason="Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), #9 # 4.4 Extreme ratios between height/width + # "Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown" + pytest.param((1, 64, 9920, 1), (1, 9920), marks=pytest.mark.skip(reason=FailingReasons.SEMAPHORE_LEAK)), #8 # 4.4 Extreme ratios between height/width + pytest.param((1, 64, 10000, 1), (1, 10000), marks=pytest.mark.skip(reason=FailingReasons.SEMAPHORE_LEAK)), #9 # 4.4 Extreme ratios between height/width ]) def test_smm_operand_src_from_tm_edge3( input_shape_dense, @@ -411,7 
+415,8 @@ def get_input_shapes_prologued(): ((2, 64, 1, 23), (23, 1), None, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 ((2, 64, 64, 1), (1, 64), None, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 ((2, 64, 100, 100), (100, 100), None, True) , #27 # 4.3 Very large (thousands, 10s of thousands) - pytest.param((2, 64, 1000, 100), (100, 1000), None, True, marks=pytest.mark.skip(reason="Fatal python error - xfail does not work. Error message: Fatal Python error: Segmentation fault; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown")), # 4.3 Very large (thousands, 10s of thousands) + # "Fatal python error - xfail does not work. Error message: Fatal Python error: Segmentation fault; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown" + pytest.param((2, 64, 1000, 100), (100, 1000), None, True, marks=pytest.mark.skip(reason=FailingReasons.SEMAPHORE_LEAK)), # 4.3 Very large (thousands, 10s of thousands) ((2, 64, 10, 1000), (1000, 10), None, True) , #29 # 4.4 Extreme ratios between height/width ((2, 64, 9920, 1), (1, 9920), None, True) , #30 # 4.4 Extreme ratios between height/width ((2, 64, 10000, 1), (1, 10000), None, True) , #31 # 4.4 Extreme ratios between height/width diff --git a/pybuda/test/operators/nary/test_concatenate.py b/pybuda/test/operators/nary/test_concatenate.py index 3d21e4ce..0b2e3176 100644 --- a/pybuda/test/operators/nary/test_concatenate.py +++ b/pybuda/test/operators/nary/test_concatenate.py @@ -64,6 +64,7 @@ from pybuda.config import _get_global_compiler_config from pybuda.verify import TestKind, verify_module from test.operators.utils import netlist_utils +from test.operators.utils import FailingReasons # Concatenate operator doesn't work for axis is equal to 0. @@ -87,7 +88,8 @@ (1, 3, 3, 3), # shape4 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3" (1, 3, 3, 3, 3) # shape5 - test fails. Message: "pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3" ] -@pytest.mark.xfail(reason="Concatenate operator doesn't work for axis value of 0.") +# Concatenate operator doesn't work for axis value of 0. 
+@pytest.mark.xfail(reason=FailingReasons.UNSUPORTED_AXIS) @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", input_shapes) def test_concatenate_invalid_axis(test_device, axis, input_shape, input_params=[], math_fidelity=None): @@ -283,7 +285,8 @@ def forward(self, x, y): # 2.4 From DRAM, but prologued (constant) # - Constants must be small enough to fit into L1 # - Input are not prologued for microbatch size = 1 -@pytest.mark.parametrize("axis", [pytest.param(-3, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3], but pass fo")), +# FAILING FOR axis=[-3], but pass for others +@pytest.mark.parametrize("axis", [pytest.param(-3, marks=pytest.mark.xfail(reason=FailingReasons.UNSUPORTED_AXIS)), pytest.param(-2), pytest.param(-1), pytest.param(1), @@ -300,11 +303,16 @@ def forward(self, x, y): pytest.param((2, 1, 15), None, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] pytest.param((2, 50, 1), None, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] pytest.param((2, 100, 100), None, True), # 4.3 Very large (thousands, 10s of thousands) - FAILING FOR axis=[-3] - pytest.param((2, 100, 1000), None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")), # 4.3 Very large (thousands, 10s of thousands) - pytest.param((2, 1, 4991), None, False, marks=pytest.mark.xfail(reason="FAILING FOR for all axises")), # 4.4 Extreme ratios between height/width - pytest.param((2, 1, 10000), None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")), # 4.4 Extreme ratios between height/width - pytest.param((2, 8191, 1), None, False, marks=pytest.mark.xfail(reason="FAILING FOR for all axises")), # 4.4 Extreme ratios between height/width - pytest.param((2, 10000, 1), None, False, marks=pytest.mark.xfail(reason="FAILING FOR axis=[-3, -1, 2]")), # 4.4 Extreme ratios between height/width + # FAILING FOR axis=[-3, -1, 2] + pytest.param((2, 100, 1000), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.3 Very large (thousands, 10s of thousands) + # FAILING FOR all axises + pytest.param((2, 1, 4991), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + # FAILING FOR axis=[-3, -1, 2] + pytest.param((2, 1, 10000), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + # FAILING FOR all axises + pytest.param((2, 8191, 1), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + # FAILING FOR axis=[-3, -1, 2] + pytest.param((2, 10000, 1), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width pytest.param((2, 32, 32), None, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] pytest.param((2, 96, 96), None, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] pytest.param((2, 13, 97), None, True), # 4.2 Prime numbers - FAILING FOR axis=[-3] @@ -449,8 +457,8 @@ def forward(self, x, y): number_of_operands = [ pytest.param(3), # all passes. 
- pytest.param(4, marks=pytest.mark.xfail(reason="fails only for GOLDEN_WORMHOLE_BO=1")), # fails only for GOLDEN_WORMHOLE_BO=1 - pytest.param(7, marks=pytest.mark.xfail(reason="fails in any case")), + pytest.param(4, marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), # fails only for GOLDEN_WORMHOLE_BO=1 + pytest.param(7, marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), # fails in any case # Error message: # ... # [Golden-7-input_shape6--1] - RuntimeError: 1 Nodes have no valid grids, exiting @@ -458,7 +466,7 @@ def forward(self, x, y): # [Golden-7-input_shape7--2] - RuntimeError: 1 Nodes have no valid grids, exiting # [Golden-7-input_shape7-1] - RuntimeError: 1 Nodes have no valid grids, exiting # ... - pytest.param(15, marks=pytest.mark.xfail(reason="fails in any case")), + pytest.param(15, marks=pytest.mark.xfail(reason=FailingReasons.INFERENCE_FAILED)), # fails in any case # Error message: # ... # [Golden-15-input_shape6--1] - RuntimeError: TT_ASSERT @ pybuda/csrc/balancer/balancer_utils.cpp:238: shape.ct % factor == 0 diff --git a/pybuda/test/operators/nary/test_index_copy.py b/pybuda/test/operators/nary/test_index_copy.py index 677021ca..cd753218 100644 --- a/pybuda/test/operators/nary/test_index_copy.py +++ b/pybuda/test/operators/nary/test_index_copy.py @@ -64,6 +64,8 @@ from pybuda.config import _get_global_compiler_config from pybuda.verify import TestKind, verify_module +from test.operators.utils import FailingReasons + # IndexCopy operator in PyBuda works in case of index is vector of one element def test_index_copy_torch_and_buda_1(): @@ -94,7 +96,8 @@ def test_index_copy_torch_and_buda_1(): # Case of IndexCopy operator is not working # In PyTorch, index can be tensor of any shape, but in PyBuda, it can be only vector of one element -@pytest.mark.xfail(reason="IndexCopy operator does not work") +# IndexCopy operator does not work +@pytest.mark.xfail(reason=FailingReasons.NOT_IMPLEMENTED) def test_index_copy_torch_and_buda_2(): zeros_torch = torch.zeros(6, 3) @@ -132,7 +135,8 @@ def test_index_copy_torch_and_buda_2(): # ... # " @pytest.mark.parametrize("input_shape", [(2, 3, 3)]) -@pytest.mark.xfail(reason="IndexCopy operator does not work on any device.") +# IndexCopy operator does not work on any device. +@pytest.mark.xfail(reason=FailingReasons.NOT_IMPLEMENTED) def test_index_copy_via_model(test_device, input_shape, input_params=[], math_fidelity=None): class Model(PyBudaModule): diff --git a/pybuda/test/operators/nary/test_stack.py b/pybuda/test/operators/nary/test_stack.py index 3794b050..1fb6a69e 100644 --- a/pybuda/test/operators/nary/test_stack.py +++ b/pybuda/test/operators/nary/test_stack.py @@ -69,6 +69,7 @@ from test.operators.utils import InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils from test.operators.utils import NetlistValidation +from test.operators.utils import FailingReasons from test.conftest import TestDevice @@ -100,9 +101,10 @@ def verify(model: PyBudaModule, test_device: TestDevice, input_shape: TensorShap # [Golden-input_shape0-0] - pybuda._C.UnsupportedHWOpsError: Splice op can only operate on dims 1, 2, or 3 # [Golden-input_shape0-2] - RuntimeError: TT_ASSERT @ pybuda/csrc/graph_lib/shape.cpp:114: (i >= 0) && (i < (int)dims_.size()) # ..." +# Bug: Stack operator doesn't work for axis values different of 1. 
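+# For reference, a small sketch of the PyTorch semantics these axis values
+# exercise (torch.stack accepts any dim in [-(d+1), d] for rank-d inputs):
+#
+#     import torch
+#     a, b = torch.zeros(3, 3), torch.ones(3, 3)
+#     torch.stack([a, b], dim=0).shape  # torch.Size([2, 3, 3])
+#     torch.stack([a, b], dim=1).shape  # torch.Size([3, 2, 3])
+#     torch.stack([a, b], dim=2).shape  # torch.Size([3, 3, 2])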
axises = [-3, -2, -1, 0, 1, 2]
input_shapes = [(1, 3, 3)]
-@pytest.mark.skip("Bug: Stack operator doesn't work for axis values different of 1.")
+@pytest.mark.skip(reason=FailingReasons.UNSUPORTED_AXIS)
@pytest.mark.parametrize("axis", axises)
@pytest.mark.parametrize("input_shape", input_shapes)
def test_stack_invalid_axis(test_device, axis, input_shape):
@@ -128,8 +130,9 @@ def forward(self, x, y):

# Stack operator works in PyTorch and PyBuda well for all axises except
# that PyBuda doesn't work for axis = -2 and axis = -1.
+# Stack operator doesn't work for axis values equal to -2 or -1.
axises = [-3, -2, -1, 0, 1, 2]
-@pytest.mark.skip("Stack operator doesn't work for axis values equal to -2 or -1.")
+@pytest.mark.skip(reason=FailingReasons.UNSUPORTED_AXIS)
@pytest.mark.parametrize("axis", axises)
def test_stack_torch_and_buda(axis):
@@ -169,11 +172,12 @@ def test_stack_torch_and_buda(axis):
# [Golden-input_shape2-1] - AssertionError
# ============================================== 11 failed, 1 passed in 2.40s ===========================================
# ..."
+# Stack operator doesn't work when the input is not a 2-dimensional tensor.
axises = [-2 , -1, 0, 1]
input_shapes = [(1, 3), # vector, always fails
(1, 1, 3), # should be reduced to vector, unexpectedly works
(1, 3, 3, 3)] # 3-dimensional tensor, always fails
-@pytest.mark.skip("Stack operator doesn't work when the input is not 2-dimensional tensor.")
+@pytest.mark.skip(reason=FailingReasons.UNSUPORTED_AXIS)
@pytest.mark.parametrize("axis", axises)
@pytest.mark.parametrize("input_shape", input_shapes)
def test_stack_invalid_shape(test_device, axis, input_shape):
diff --git a/pybuda/test/operators/nary/test_where.py b/pybuda/test/operators/nary/test_where.py
index f9fce3f2..72ff5064 100644
--- a/pybuda/test/operators/nary/test_where.py
+++ b/pybuda/test/operators/nary/test_where.py
@@ -16,7 +16,11 @@
 from pybuda import PyBudaModule, VerifyConfig
 from pybuda.verify import TestKind, verify_module

-@pytest.mark.skip(reason="This test is failing due to not supporting 'BoolTensor' for a condition")
+from test.operators.utils import FailingReasons
+
+
+# This test is failing because 'BoolTensor' is not supported for the condition
+@pytest.mark.xfail(reason=FailingReasons.UNSUPPORTED_TYPE_FOR_VALIDATION)
def test_cond_bool_tensor_manual_inputs(test_device):
 class Model(PyBudaModule):
 def __init__(self, name):
@@ -56,7 +60,8 @@ def forward(self, cond, x, y):
 inputs=[(condition_tensor, x_tensor, y_tensor)],
 )

-@pytest.mark.skip(reason="This test is failing when condition_tensor elements have values <> 0.0 or 1.0")
+# This test is failing when condition_tensor elements have values other than 0.0 or 1.0
+@pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)
def test_cond_non_bool_tensor_manual_inputs(test_device):
 class Model(PyBudaModule):
 def __init__(self, name):
@@ -94,7 +99,8 @@ def forward(self, cond, x, y):
 inputs=[(condition_tensor, x_tensor, y_tensor)],
 )

-@pytest.mark.skip(reason="This test is failing due assertion error - data mismatch detected")
+# This test is failing due to an assertion error - data mismatch detected
+@pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)
@pytest.mark.parametrize("input_shape", [(1, 3, 3)])
def test_where_input_shapes(test_device, input_shape):
 class Model(PyBudaModule):
@@ -130,8 +136,11 @@ def forward(self, cond, x, y):
 [1., 0.],
 [1., 0.]]]

-@pytest.mark.skip(reason="This test is failing due to verify_module calculates wrong pcc")
-@pytest.mark.parametrize("cond_values", [cond_values_1, cond_values_2])
+# This test is failing because verify_module calculates a wrong PCC
+@pytest.mark.parametrize("cond_values", [
+    pytest.param(cond_values_1),
+    pytest.param(cond_values_2, marks=pytest.mark.xfail(reason=FailingReasons.DATA_MISMATCH)),
+])
def test_where_verify_module(test_device, cond_values):
 class Model(PyBudaModule):
 def __init__(self, name):
diff --git a/pybuda/test/operators/utils/__init__.py b/pybuda/test/operators/utils/__init__.py
index a925e631..056a223c 100644
--- a/pybuda/test/operators/utils/__init__.py
+++ b/pybuda/test/operators/utils/__init__.py
@@ -9,6 +9,7 @@
 from .utils import NetlistValidation
 from .utils import LoggerUtils
 from .netlist_utils import read_netlist_value
+from .failing_reasons import FailingReasons

 __all__ = [
 'read_netlist_value',
@@ -19,4 +20,5 @@
 'VerifyUtils',
 'NetlistValidation',
 'LoggerUtils',
+ 'FailingReasons',
 ]
diff --git a/pybuda/test/operators/utils/failing_reasons.py b/pybuda/test/operators/utils/failing_reasons.py
new file mode 100644
index 00000000..e10c33f9
--- /dev/null
+++ b/pybuda/test/operators/utils/failing_reasons.py
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+# Failing reasons for pytest marks
+
+
+# Compilation failed
+# Inference failed
+# Validation failed
+# Special case not supported
+# ...
+
+
+class FailingReasons:
+    NOT_IMPLEMENTED = "Not implemented operator"
+
+    BUGGY_SHAPE = "Buggy shape"
+
+    MICROBATCHING_UNSUPPORTED = "Higher microbatch size is not supported"
+
+    UNSUPPORTED_DIMENSION = "Unsupported dimension"
+
+    UNSUPORTED_AXIS = "Unsupported axis parameter"
+
+    UNSUPPORTED_PARAMETER_VALUE = "Unsupported parameter value"
+
+    UNSUPPORTED_SPECIAL_CASE = "Unsupported special case"
+
+    # Error message: E RuntimeError: TT_ASSERT @ pybuda/csrc/passes/lowering_context.cpp:28: old_node->node_type() != graphlib::NodeType::kPyOp
+    # Error for input shape (1, 1, 10000, 1). Error message: RuntimeError: TT_ASSERT @ pybuda/csrc/placer/lower_to_placer.cpp:245:
+    COMPILATION_FAILED = "Model compilation failed"
+
+    # Error message: E AssertionError: Error during inference
+    INFERENCE_FAILED = "Inference failed"
+
+    # "Error message: E AssertionError: Data mismatch detected"
+    # Validation error caused by pcc threshold
+    DATA_MISMATCH = "Verification failed due to data mismatch"
+
+    UNSUPPORTED_TYPE_FOR_VALIDATION = "Verification failed due to unsupported type in verify_module"
+
+    # "Fatal python error - xfail does not work; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown"
+    # "Fatal python error - xfail does not work. 
Error message: Fatal Python error: Segmentation fault; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown" + SEMAPHORE_LEAK = "Semaphore leak" + + # RuntimeError: Fatal Python error: Segmentation fault + SEG_FAULT = "Inference failed due to seg fault" + + UNSUPPORTED_INPUT_SOURCE = "Unsupported input source" + From 91752ea376980ab029204fe059206f7c6bddf095 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 14 Aug 2024 16:07:56 +0000 Subject: [PATCH 092/116] handle change rank on pattern replacement (cherry picked from commit 004d3ef8a0c65b7eb561e0154ff99b697afe4c88) --- .../passes/fuse_redundant_tm_sequence.cpp | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp index 4765a32f..8d30dbb7 100644 --- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp +++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "passes/fuse_redundant_tm_sequence.hpp" #include "passes/commute_utils.hpp" +#include "reportify/reportify.hpp" +#include "autograd/binding.hpp" using tt::LogTMFusion; namespace tt::passes @@ -201,7 +203,7 @@ bool replace_pattern_with_new_pattern( bool multiple_user = false; std::vector users; graphlib::Node * fuse_node = nullptr; - graphlib::Node *terminal_node = pattern_sequence.back(); + graphlib::OpNode *terminal_node = pattern_sequence.back(); pattern_sequence.pop_back(); // Check whether the matched pattern has multiple user or not @@ -246,6 +248,18 @@ bool replace_pattern_with_new_pattern( } } + std::string message = "Found replaceable TM sequence. Fuse from " + std::to_string(current_pattern.size()) + " tms into " + std::to_string(replace_pattern.size()) + " tms."; + message = message + " Input Shape: " + sequence_producer->shape().as_string(); + message = message + " Pattern: "; + + for (auto *op : pattern_sequence) { + message = message + op->op_type().as_string() + " -> "; + } + message = message + terminal_node->op_type().as_string() + " ===> "; + for (auto item : replace_pattern) { + message = message + " " + item.as_op_type().as_string() + " "; + } + // remove the edges of the users if it is same op and same shape if (multiple_user) { for (auto& user : users) { @@ -269,6 +283,12 @@ bool replace_pattern_with_new_pattern( std::string name = sequence_producer->name() + "_fused_tm_op_" + std::to_string(current_edge.edge_creation_id); auto new_node = graph->add_node( std::make_unique(name, op.as_op_type()), graph->get_subgraph_id_for_node(sequence_producer->id())); + + std::vector operand_shapes{graph->node_by_id(current_edge.producer_node_id)->shape()}; + std::tuple> shape_data = get_op_shape(op.as_op_type(), operand_shapes, false, operand_shapes[0].get_tile_dim()); + + graphlib::Shape node_shape = std::get<0>(shape_data); + new_node->set_shape(node_shape); fuse_node = new_node; auto [new_in_edge, new_out_edge] = graphlib::insert_node_on_edge(graph, current_edge, new_node); current_edge = new_out_edge; @@ -280,13 +300,18 @@ bool replace_pattern_with_new_pattern( // connect the edge of the users to the fused op if (multiple_user) { for (auto& user : users){ - if (user != terminal_node) + if (user != terminal_node) { graph->add_edge(fuse_node, user); + } } } - recalculate_shapes(graph); - log_info(LogTMFusion, "Found replaceable TM sequence. 
Fuse from {} tms into {} tms.", current_pattern.size(), replace_pattern.size());
+
+    for (graphlib::Edge user_edge : graph->user_data_edges(fuse_node)) {
+        handle_change_rank(graph, user_edge);
+    }
+
+    log_info(LogTMFusion, "{}", message);
+
     return true;
 }

From ea1a509ef97020e136c21e74acce73fe996afd36 Mon Sep 17 00:00:00 2001
From: Guangyu Feng
Date: Tue, 20 Aug 2024 20:56:06 +0000
Subject: [PATCH 093/116] Adjust the order of sys path

Insert confidential_customer_models at the beginning, due to a conflict with the datasets package.

(cherry picked from commit b377ec4ec8f51e4079c046397469e2d669a6540e)
---
 pybuda/test/benchmark/benchmark/models/openpose_hand.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pybuda/test/benchmark/benchmark/models/openpose_hand.py b/pybuda/test/benchmark/benchmark/models/openpose_hand.py
index f5738821..68899324 100644
--- a/pybuda/test/benchmark/benchmark/models/openpose_hand.py
+++ b/pybuda/test/benchmark/benchmark/models/openpose_hand.py
@@ -15,7 +15,7 @@ def openpose_hand(training: bool, config: str, microbatch: int, devtype: str, ar
     from pybuda._C.backend_api import BackendDevice

     # Import confidential model implementation
-    sys.path.append(os.path.join(os.path.dirname(__file__), '../../../../../', 'third_party/confidential_customer_models/'))
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../../../../', 'third_party/confidential_customer_models/'))
     from benchmarks.openpose import OpenPoseHandModel, transfer

     # Configurations

From 41da5d536cb7ad83bd45a17c9cae1d39a2563d63 Mon Sep 17 00:00:00 2001
From: Sterling Taylor
Date: Wed, 21 Aug 2024 16:55:02 +0000
Subject: [PATCH 094/116] Update file README.md

(cherry picked from commit 6d3f19fb57002d41a597caaa97012bea7ca1ac69)
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b89721dc..0029ccdc 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@

-[TT-Buda Docs](https://docs.tenstorrent.com/tenstorrent/v/tt-buda) | [Model Demos](https://github.com/tenstorrent/tt-buda-demos/tree/main/model_demos#models-support-table) +[TT-Buda Docs](https://docs.tenstorrent.com/pybuda/latest/index.html) | [Model Demos](https://github.com/tenstorrent/tt-buda-demos/tree/main/model_demos#models-support-table)

@@ -36,7 +36,7 @@ https://github.com/tenstorrent/tt-buda-demos ## Docs -See: [Docs](https://docs.tenstorrent.com/tenstorrent/v/tt-buda) +See: [Docs](https://docs.tenstorrent.com/pybuda/latest/index.html) ## Build From c981da356fb420de4e8e71319712d757688658af Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 20 Aug 2024 17:02:16 +0000 Subject: [PATCH 095/116] Failing reasons Issue #2554 / #2787 (cherry picked from commit e022bde3575d897e3da7bff3eea00c079a8835f8) --- .../eltwise_binary/test_pytorch_binary.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py index 3cf132b1..6f00516d 100644 --- a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py +++ b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py @@ -20,6 +20,7 @@ from pybuda.op_repo import TensorShape from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils +from test.operators.utils import FailingReasons from test.conftest import TestDevice from test.random.rgg import RateLimiter @@ -243,40 +244,40 @@ def get_input_shapes(): # 2-dimensional shape, microbatch_size > 1: # All shapes fails for all operators pytest.param((3, 4), #13 # 3.1 Full tensor (i.e. full expected shape) - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.run_in_pp]), pytest.param((45, 17), #14 # 3.1 Full tensor (i.e. full expected shape) - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((64, 1), #15 # 3.2 Tensor reduce on one or more dims to 1 - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((100, 100), #16 # 4.3 Very large (thousands, 10s of thousands) - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((1000, 100), #17 # 4.3 Very large (thousands, 10s of thousands) - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((10, 1000), #18 # 4.4 Extreme ratios between height/width - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((9920, 1), #19 # 4.4 Extreme ratios between height/width - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((10000, 1), #20 # 4.4 Extreme ratios between height/width - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((32, 64), #21 # 4.1 Divisible by 32 - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), 
pytest.param((160, 96), #22 # 4.1 Divisible by 32 - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), pytest.param((17, 41), #23 # 4.2 Prime numbers - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.run_in_pp]), pytest.param((89, 3), #24 # 4.2 Prime numbers - marks=[pytest.mark.xfail(reason="Skip shapes where microbatchsize > 1"), + marks=[pytest.mark.xfail(reason=FailingReasons.MICROBATCHING_UNSUPPORTED), pytest.mark.slow]), # 3-dimensional shape, microbatch_size = 1: @@ -423,7 +424,7 @@ def get_not_implemented_pytorch_binary_ops(): @pytest.mark.parametrize("input_operator", get_not_implemented_pytorch_binary_ops()) @pytest.mark.parametrize("model_type", MODEL_TYPES) @pytest.mark.parametrize("input_shape", input_shapes) -@pytest.mark.xfail(reason="Skip not implemented operators") +@pytest.mark.xfail(reason=FailingReasons.NOT_IMPLEMENTED) def test_not_implemented_pytorch_eltwise_binary_ops_per_test_plan( input_operator, model_type, From 09642fe014d1b8c4d7ca3af75a0a55d10a0ad9a3 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 20 Aug 2024 15:36:17 +0000 Subject: [PATCH 096/116] Usage of NetlistValidation Issue #2554 / #2787 (cherry picked from commit 2c0cc64719d79d03bd48e25bcced22986fd7bb4d) --- .../eltwise_binary/test_eltwise_binary.py | 15 ++++++------- .../eltwise_unary/test_eltwise_unary.py | 9 ++++---- pybuda/test/operators/matmul/test_matmul.py | 14 ++++++------- .../operators/matmul/test_sparse_matmul.py | 21 +++++++------------ .../test/operators/nary/test_concatenate.py | 16 +++++++------- .../eltwise_binary/test_pytorch_binary.py | 12 +++++------ 6 files changed, 41 insertions(+), 46 deletions(-) diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index a29aa222..442c315e 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -70,8 +70,9 @@ from pybuda import PyBudaModule from pybuda.op_repo import TensorShape -from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils +from test.operators.utils import InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons from test.conftest import TestDevice @@ -464,15 +465,15 @@ def test_eltwise_binary_ops_per_test_plan( # netlist validations: - file_path = VerifyUtils.get_netlist_filename() + netlist = NetlistValidation() if model_type == ModelFromDramQueue: - assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' - assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + assert netlist.get_value("/queues/x/loc") == 'dram' + assert netlist.get_value("/queues/y/loc") == 'dram' if model_type == ModelConstEvalPass: # Here we check there is no key with operator name in the netlist in graphs section - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert input_operator not in key @@ -583,8 +584,8 @@ def test_eltwise_binary_ops_per_test_plan_dram_prologued( ) # netlist validation: - file_path = VerifyUtils.get_netlist_filename() - d = 
netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_" + input_operator + "0") + netlist = NetlistValidation() + d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/input_0_" + input_operator + "0") if should_prolog: assert d['prologue'] else: diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py index 111c43ea..73e00573 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py @@ -66,7 +66,8 @@ from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig from pybuda.verify.config import TestKind -from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils +from test.operators.utils import InputSourceFlags, VerifyUtils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons from test.conftest import TestDevice @@ -124,12 +125,12 @@ def verify( math_fidelity=input_math_fidelity, ) - file_path = VerifyUtils.get_netlist_filename() + netlist = NetlistValidation() match model: case "model_op_src_from_dram": - assert netlist_utils.read_netlist_value(file_path, "/queues/x1/loc") == 'dram' + assert netlist.get_value("/queues/x1/loc") == 'dram' case "model_op_src_const_inputs1": - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert input_operator not in key diff --git a/pybuda/test/operators/matmul/test_matmul.py b/pybuda/test/operators/matmul/test_matmul.py index 29af7659..03bb7b81 100644 --- a/pybuda/test/operators/matmul/test_matmul.py +++ b/pybuda/test/operators/matmul/test_matmul.py @@ -84,7 +84,7 @@ from pybuda.op.eval.common import compare_tensor_to_golden -from test.operators.utils import netlist_utils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons from .models import generic @@ -376,13 +376,13 @@ def test_matmul_according_to_test_plan( input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename + netlist = NetlistValidation() match model: case "model_op_src_from_dram2": - assert netlist_utils.read_netlist_value(file_path, "/queues/x1/loc") == 'dram' - assert netlist_utils.read_netlist_value(file_path, "/queues/x2/loc") == 'dram' + assert netlist.get_value("/queues/x1/loc") == 'dram' + assert netlist.get_value("/queues/x2/loc") == 'dram' case "model_op_src_const_inputs1": - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Matmul" not in key @@ -476,8 +476,8 @@ def test_matmul_dram_prologued( ), ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_mm1") + netlist = NetlistValidation() + d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/input_0_mm1") if prologue: assert d['prologue'] else: diff --git a/pybuda/test/operators/matmul/test_sparse_matmul.py b/pybuda/test/operators/matmul/test_sparse_matmul.py index e5779464..ff89c283 100644 --- a/pybuda/test/operators/matmul/test_sparse_matmul.py +++ b/pybuda/test/operators/matmul/test_sparse_matmul.py @@ -46,7 +46,6 @@ # (+) 6. 
Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 # (/) 7. Special attributes - if applicable.. like approx_mode for Exp, for example -import os import pytest from pybuda.config import _get_global_compiler_config @@ -61,15 +60,9 @@ from pybuda.module import PyBudaModule -from pybuda.module import PyBudaModule - -from pybuda import pybuda - from pybuda.verify.backend import verify_module -from pybuda.verify.config import VerifyConfig, TestKind -import torch -from test.operators.utils import netlist_utils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons @@ -180,8 +173,8 @@ def forward(self, dense): ), input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - assert netlist_utils.read_netlist_value(file_path, "/queues/dense/loc") == 'dram' + netlist = NetlistValidation() + assert netlist.get_value("/queues/dense/loc") == 'dram' @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_const_inputs_const_eval( @@ -224,8 +217,8 @@ def forward(self, x1, x2): ), input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + netlist = NetlistValidation() + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Matmul" not in key @@ -463,8 +456,8 @@ def forward(self, x): ), input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/lc.input_tensor.smm1.0") + netlist = NetlistValidation() + d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/lc.input_tensor.smm1.0") if prologue: assert d['prologue'] else: diff --git a/pybuda/test/operators/nary/test_concatenate.py b/pybuda/test/operators/nary/test_concatenate.py index 0b2e3176..71cdd1d5 100644 --- a/pybuda/test/operators/nary/test_concatenate.py +++ b/pybuda/test/operators/nary/test_concatenate.py @@ -63,7 +63,7 @@ from pybuda import PyBudaModule, VerifyConfig from pybuda.config import _get_global_compiler_config from pybuda.verify import TestKind, verify_module -from test.operators.utils import netlist_utils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons @@ -276,9 +276,9 @@ def forward(self, x, y): ), input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' - assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + netlist = NetlistValidation() + assert netlist.get_value("/queues/x/loc") == 'dram' + assert netlist.get_value("/queues/y/loc") == 'dram' @@ -354,8 +354,8 @@ def forward(self, x): ), input_params=[input_params], ) - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - d = netlist_utils.read_netlist_value(file_path, "/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Concatenate0") + netlist = NetlistValidation() + d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Concatenate0") if should_prolog: assert d['prologue'] else: @@ -419,8 +419,8 @@ def forward(self, x, y): input_params=[input_params], ) # Here we check there is no key with "Concatenate" in the netlist 
in graphs section - file_path = pybuda.pybudaglobal.get_devices()[0]._compile_output.netlist_filename - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + netlist = NetlistValidation() + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Concatenate" not in key diff --git a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py index 6f00516d..68e890b0 100644 --- a/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py +++ b/pybuda/test/operators_pytorch/eltwise_binary/test_pytorch_binary.py @@ -18,8 +18,9 @@ import pybuda.op from pybuda.op_repo import TensorShape -from test.operators.utils import netlist_utils, InputSourceFlags, VerifyUtils +from test.operators.utils import InputSourceFlags, VerifyUtils from test.operators.utils import ShapeUtils +from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons from test.conftest import TestDevice from test.random.rgg import RateLimiter @@ -378,15 +379,14 @@ def test_pytorch_eltwise_binary_ops_per_test_plan( # netlist validations: - file_path = VerifyUtils.get_netlist_filename() - + netlist = NetlistValidation() if model_type == ModelFromDramQueue: - assert netlist_utils.read_netlist_value(file_path, "/queues/x/loc") == 'dram' - assert netlist_utils.read_netlist_value(file_path, "/queues/y/loc") == 'dram' + assert netlist.get_value("/queues/x/loc") == 'dram' + assert netlist.get_value("/queues/y/loc") == 'dram' if model_type == ModelConstEvalPass: # Here we check there is no key with operator name in the netlist in graphs section - d = netlist_utils.read_netlist_value(file_path, "/graphs/fwd_0_0_temporal_epoch_0") + d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): if key == "target_device": continue From 4434e55eeb5d564c6877f2be15593c3e75808cf1 Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Tue, 20 Aug 2024 16:50:53 +0000 Subject: [PATCH 097/116] verify from VerifyUtils Issue #2554 / #2787 (cherry picked from commit 85884a05c60d0c1d63a78392c12e5d340f5d1ef1) --- pybuda/test/operators/matmul/test_matmul.py | 229 ++++++----- .../operators/matmul/test_sparse_matmul.py | 377 +++++++++--------- .../test/operators/nary/test_concatenate.py | 355 ++++++++--------- 3 files changed, 480 insertions(+), 481 deletions(-) diff --git a/pybuda/test/operators/matmul/test_matmul.py b/pybuda/test/operators/matmul/test_matmul.py index 03bb7b81..94443da4 100644 --- a/pybuda/test/operators/matmul/test_matmul.py +++ b/pybuda/test/operators/matmul/test_matmul.py @@ -84,8 +84,13 @@ from pybuda.op.eval.common import compare_tensor_to_golden +from pybuda.op_repo import TensorShape + +from test.operators.utils import InputSourceFlags, VerifyUtils +from test.operators.utils import ShapeUtils from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons +from test.conftest import TestDevice from .models import generic from .models import custom @@ -101,6 +106,28 @@ MODELS_TEST_PLAN_PATH = MODELS_PATH + "test_plan/" +def verify( + test_device: TestDevice, + model: PyBudaModule, + input_shapes: List[TensorShape], + input_params: List[Dict] = [], + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, +): + '''Common verification function for all tests''' + + VerifyUtils.verify( + model=model, + test_device=test_device, + 
input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) + + SHAPE_NO = 1 SHAPE_SIZE_MIN = 2 SHAPE_SIZE_MAX = 4 @@ -228,16 +255,12 @@ def test_matmul_according_to_pytorch_docs( compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_training = False - compiler_cfg.input_queues_on_host = True - verify_module( - model, + verify( + test_device=test_device, + model=model, input_shapes=model.shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), + input_source_flag=InputSourceFlags.FROM_HOST, ) @@ -281,7 +304,7 @@ def test_matmul_according_to_test_plan( model, input_shape, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): if(model == "model_op_src_const_inputs2" and math_fidelity == None): @@ -340,14 +363,14 @@ def test_matmul_according_to_test_plan( match model: case "model_op_src_from_dram1": - input_shape = (1,) + input_shape[1:] + input_shape = ShapeUtils.reduce_microbatch_size(input_shape) architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})' case "model_op_src_const_inputs1": - input_shape = (1,) + input_shape[1:] - tr = (1,) + tr[1:] + input_shape = ShapeUtils.reduce_microbatch_size(input_shape) + tr = ShapeUtils.reduce_microbatch_size(tr) architecture = f'test_plan.{model}.BudaMatmulTest({input_shape}, {tr})' case "model_op_src_const_inputs2": - input_shape = (1,) + input_shape[1:] + input_shape = ShapeUtils.reduce_microbatch_size(input_shape) architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})' case _: architecture = f'test_plan.{model}.BudaMatmulTest()' @@ -359,21 +382,17 @@ def test_matmul_according_to_test_plan( compiler_cfg.enable_training = False match model: case "model_op_src_from_dram2": - compiler_cfg.input_queues_on_host = False + input_source_flag = InputSourceFlags.FROM_DRAM case _: - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - model_eval, + input_source_flag = InputSourceFlags.FROM_HOST + + verify( + test_device=test_device, + model=model_eval, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) netlist = NetlistValidation() @@ -391,49 +410,49 @@ def test_matmul_according_to_test_plan( def get_input_shapes_prologued(): # Here we cover interesting combinations of input shapes: return [ - ((2, 3, 4), True, False), #0 # 3.1 Full tensor (i.e. full expected shape) - ((2, 3, 4), False, True), #1 # 3.1 Full tensor (i.e. full expected shape) - ((2, 3, 4), None, True), #2 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 4), True, False), #3 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 4), False, True), #4 # 3.1 Full tensor (i.e. full expected shape) - ((1, 3, 4), None, True), #5 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs - ((2, 45, 17), None, True), #6 # 3.1 Full tensor (i.e. 
full expected shape) - ((2, 1, 23), None, True), #7 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 64, 1), None, True), #8 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 100, 100), None, True), #9 # 4.3 Very large (thousands, 10s of thousands) - ((2, 1000, 100), None, True), #10 # 4.3 Very large (thousands, 10s of thousands) - ((2, 10, 1000), None, True), #11 # 4.4 Extreme ratios between height/width - ((2, 9920, 1), None, True), #12 # 4.4 Extreme ratios between height/width - ((2, 10000, 1), None, False), #13 # 4.4 Extreme ratios between height/width - ((2, 32, 64), None, True), #14 # 4.1 Divisible by 32 - ((2, 160, 96), None, True), #15 # 4.1 Divisible by 32 - ((2, 17, 41), None, True), #16 # 4.2 Prime numbers - ((2, 89, 3), None, True), #17 # 4.2 Prime numbers - - ((2, 1, 3, 4), True, False), #18 # 3.1 Full tensor (i.e. full expected shape) - ((2, 1, 3, 4), False, True), #19 # 3.1 Full tensor (i.e. full expected shape) - ((2, 1, 3, 4), None, True) , #20 # 3.1 Full tensor (i.e. full expected shape) - ((1, 1, 3, 4), True, False), #21 # 3.1 Full tensor (i.e. full expected shape) - ((1, 1, 3, 4), False, True), #22 # 3.1 Full tensor (i.e. full expected shape) - ((1, 1, 3, 4), None, True), #23 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs - ((2, 1, 45, 17), None, True) , #24 # 3.1 Full tensor (i.e. full expected shape) - ((2, 1, 1, 23), None, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 1, 64, 1), None, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 1, 100, 100), None, True) , #27 # 4.3 Very large (thousands, 10s of thousands) - ((2, 1, 1000, 100), None, True) , #28 # 4.3 Very large (thousands, 10s of thousands) - ((2, 1, 10, 1000), None, True) , #29 # 4.4 Extreme ratios between height/width - ((2, 1, 9920, 1), None, True) , #30 # 4.4 Extreme ratios between height/width - ((2, 1, 10000, 1), None, True) , #31 # 4.4 Extreme ratios between height/width - ((2, 1, 32, 64), None, True) , #32 # 4.1 Divisible by 32 - ((2, 1, 160, 96), None, True) , #33 # 4.1 Divisible by 32 - ((2, 1, 17, 41), None, True) , #34 # 4.2 Prime numbers - ((2, 1, 89, 3), None, True) , #35 # 4.2 Prime numbers + ((2, 3, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #0 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #1 # 3.1 Full tensor (i.e. full expected shape) + ((2, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #2 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #3 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #4 # 3.1 Full tensor (i.e. full expected shape) + ((1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #5 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs + ((2, 45, 17), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #6 # 3.1 Full tensor (i.e. 
full expected shape) + ((2, 1, 23), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #7 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 64, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #8 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #9 # 4.3 Very large (thousands, 10s of thousands) + ((2, 1000, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #10 # 4.3 Very large (thousands, 10s of thousands) + ((2, 10, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #11 # 4.4 Extreme ratios between height/width + ((2, 9920, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #12 # 4.4 Extreme ratios between height/width + ((2, 10000, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False), #13 # 4.4 Extreme ratios between height/width + ((2, 32, 64), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #14 # 4.1 Divisible by 32 + ((2, 160, 96), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #15 # 4.1 Divisible by 32 + ((2, 17, 41), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #16 # 4.2 Prime numbers + ((2, 89, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #17 # 4.2 Prime numbers + + ((2, 1, 3, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #18 # 3.1 Full tensor (i.e. full expected shape) + ((2, 1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #19 # 3.1 Full tensor (i.e. full expected shape) + ((2, 1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #20 # 3.1 Full tensor (i.e. full expected shape) + ((1, 1, 3, 4), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #21 # 3.1 Full tensor (i.e. full expected shape) + ((1, 1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #22 # 3.1 Full tensor (i.e. full expected shape) + ((1, 1, 3, 4), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #23 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs + ((2, 1, 45, 17), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #24 # 3.1 Full tensor (i.e. 
full expected shape) + ((2, 1, 1, 23), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 1, 64, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 1, 100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #27 # 4.3 Very large (thousands, 10s of thousands) + ((2, 1, 1000, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #28 # 4.3 Very large (thousands, 10s of thousands) + ((2, 1, 10, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #29 # 4.4 Extreme ratios between height/width + ((2, 1, 9920, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #30 # 4.4 Extreme ratios between height/width + ((2, 1, 10000, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #31 # 4.4 Extreme ratios between height/width + ((2, 1, 32, 64), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #32 # 4.1 Divisible by 32 + ((2, 1, 160, 96), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #33 # 4.1 Divisible by 32 + ((2, 1, 17, 41), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #34 # 4.2 Prime numbers + ((2, 1, 89, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #35 # 4.2 Prime numbers ] -@pytest.mark.parametrize("input_shape, default_dram_params, prologue", get_input_shapes_prologued()) +@pytest.mark.parametrize("input_shape, input_source_flag, prologue", get_input_shapes_prologued()) def test_matmul_dram_prologued( input_shape, - default_dram_params, + input_source_flag, prologue, test_device, ): @@ -455,7 +474,7 @@ def test_matmul_dram_prologued( input_shapes.append(tr) input_shapes = tuple(input_shapes) - input_shape = (1,) + input_shape[1:] + input_shape = ShapeUtils.reduce_microbatch_size(input_shape) architecture = f'test_plan.{model}.BudaMatmulTest({input_shape})' model_eval = eval(architecture) @@ -463,17 +482,12 @@ def test_matmul_dram_prologued( # set compiler config file compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_training = False - compiler_cfg.input_queues_on_host = False - compiler_cfg.default_dram_parameters = default_dram_params - - verify_module( - model_eval, + + verify( + test_device=test_device, + model=model_eval, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), + input_source_flag=input_source_flag, ) netlist = NetlistValidation() @@ -487,9 +501,9 @@ def test_matmul_dram_prologued( def get_input_shape(microbatch_size1=1, microbatch_size2=1): return (microbatch_size1, microbatch_size2, 11, 37) -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Float16_b}, - ] +dev_data_formats=[ + pybuda.DataFormat.Float16_b, +] compiler_math_fidelity = [ pybuda.MathFidelity.LoFi, @@ -499,35 +513,40 @@ def get_input_shape(microbatch_size1=1, microbatch_size2=1): ] @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_TEST_PLAN_PATH) if "model" in item]) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_matmul_mf_inputs(model, test_device, math_fidelity): - test_matmul_according_to_test_plan(model, get_input_shape(), test_device, verify_input_params, math_fidelity); - - - -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Bfp2}, - {"dev_data_format": pybuda.DataFormat.Bfp2_b}, - 
{"dev_data_format": pybuda.DataFormat.Bfp4}, - {"dev_data_format": pybuda.DataFormat.Bfp4_b}, - {"dev_data_format": pybuda.DataFormat.Bfp8}, - {"dev_data_format": pybuda.DataFormat.Bfp8_b}, - {"dev_data_format": pybuda.DataFormat.Float16}, - {"dev_data_format": pybuda.DataFormat.Float16_b}, - {"dev_data_format": pybuda.DataFormat.Float32}, - {"dev_data_format": pybuda.DataFormat.Int8}, - {"dev_data_format": pybuda.DataFormat.Lf8}, - {"dev_data_format": pybuda.DataFormat.RawUInt16}, - {"dev_data_format": pybuda.DataFormat.RawUInt32}, - {"dev_data_format": pybuda.DataFormat.RawUInt8}, - {"dev_data_format": pybuda.DataFormat.UInt16}, - ] -compiler_math_fidelity = pybuda.MathFidelity.HiFi4 +def test_matmul_mf_inputs(model, test_device, dev_data_format, math_fidelity): + test_matmul_according_to_test_plan(model, get_input_shape(), test_device, dev_data_format, math_fidelity) + + + +dev_data_formats=[ + pybuda.DataFormat.Bfp2, + pybuda.DataFormat.Bfp2_b, + pybuda.DataFormat.Bfp4, + pybuda.DataFormat.Bfp4_b, + pybuda.DataFormat.Bfp8, + pybuda.DataFormat.Bfp8_b, + pybuda.DataFormat.Float16, + pybuda.DataFormat.Float16_b, + pybuda.DataFormat.Float32, + pybuda.DataFormat.Int8, + pybuda.DataFormat.Lf8, + pybuda.DataFormat.RawUInt16, + pybuda.DataFormat.RawUInt32, + pybuda.DataFormat.RawUInt8, + pybuda.DataFormat.UInt16, +] + +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_TEST_PLAN_PATH) if "model" in item]) -@pytest.mark.parametrize("input_params", verify_input_params) -def test_matmul_df_inputs(model, test_device, input_params): - test_matmul_according_to_test_plan(model, get_input_shape(), test_device, input_params, compiler_math_fidelity); +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_matmul_df_inputs(model, test_device, dev_data_format, math_fidelity): + test_matmul_according_to_test_plan(model, get_input_shape(), test_device, dev_data_format, math_fidelity) # from sanity diff --git a/pybuda/test/operators/matmul/test_sparse_matmul.py b/pybuda/test/operators/matmul/test_sparse_matmul.py index ff89c283..26da6cc3 100644 --- a/pybuda/test/operators/matmul/test_sparse_matmul.py +++ b/pybuda/test/operators/matmul/test_sparse_matmul.py @@ -48,6 +48,8 @@ import pytest +from typing import Dict, List + from pybuda.config import _get_global_compiler_config from pybuda import Tensor import torch @@ -60,11 +62,34 @@ from pybuda.module import PyBudaModule -from pybuda.verify.backend import verify_module +from pybuda.op_repo import TensorShape +from test.operators.utils import InputSourceFlags, VerifyUtils from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons +from test.conftest import TestDevice + + +def verify( + test_device: TestDevice, + model: PyBudaModule, + input_shapes: List[TensorShape], + input_params: List[Dict] = [], + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, +): + '''Common verification function for all tests''' + VerifyUtils.verify( + model=model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) def get_input_shapes(micro_batch_size=1): @@ -102,12 +127,13 @@ def get_sparse_tensor(shape, const_input = True): sparse = 
pybuda.Tensor.create_from_torch(sparse, constant=const_input) return sparse + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_host( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -123,27 +149,23 @@ def forward(self, dense): mod = Model("test_sparse_matmul_operand_src_from_host", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_dram( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -159,29 +181,26 @@ def forward(self, dense): mod = Model("test_sparse_matmul_operand_src_from_dram", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = False - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_DRAM, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + netlist = NetlistValidation() assert netlist.get_value("/queues/dense/loc") == 'dram' + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_const_inputs_const_eval( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -203,31 +222,28 @@ def forward(self, x1, x2): input_shape_dense_tr = (input_shape_dense[0],input_shape_dense[1],input_shape_dense[3],input_shape_dense[2]) input_shapes = tuple([input_shape_dense, input_shape_dense_tr]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + netlist = NetlistValidation() d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") for key in d.keys(): assert "Matmul" not in key + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_another_op( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, 
math_fidelity=None ): class Model(PyBudaModule): @@ -244,27 +260,23 @@ def forward(self, x): mod = Model("test_sparse_matmul_operand_src_from_another_op", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_tm_edge1( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -282,27 +294,23 @@ def forward(self, x): mod = Model("test_sparse_matmul_operand_src_from_tm_edge1", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", get_input_shapes()) def test_smm_operand_src_from_tm_edge2( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -321,21 +329,17 @@ def forward(self, x): mod = Model("test_sparse_matmul_operand_src_from_tm_edge2", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + @pytest.mark.parametrize("input_shape_dense, input_shape_sparse", [ pytest.param((1, 64, 3, 4), (4, 3)), #1 # 3.1 Full tensor (i.e. 
full expected shape)), pytest.param((1, 64, 1, 23), (23, 1)), #3 # 3.2 Tensor reduce on one or more dims to 1 @@ -361,7 +365,7 @@ def test_smm_operand_src_from_tm_edge3( input_shape_dense, input_shape_sparse, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -379,53 +383,48 @@ def forward(self, x): mod = Model("test_sparse_matmul_operand_src_from_tm_edge3", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = True - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=InputSourceFlags.FROM_HOST, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) def get_input_shapes_prologued(): # Here we cover interesting combinations of input shapes: return [ - ((2, 64, 3, 4), (4, 3), True, False), #18 # 3.1 Full tensor (i.e. full expected shape) - ((2, 64, 3, 4), (4, 3), False, True), #19 # 3.1 Full tensor (i.e. full expected shape) - ((2, 64, 3, 4), (4, 3), None, True) , #20 # 3.1 Full tensor (i.e. full expected shape) - ((1, 64, 3, 4), (4, 3), True, False), #21 # 3.1 Full tensor (i.e. full expected shape) - ((1, 64, 3, 4), (4, 3), False, True), #22 # 3.1 Full tensor (i.e. full expected shape) - ((1, 64, 3, 4), (4, 3), None, True), #23 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs - ((2, 64, 45, 17), (17, 45), None, True) , #24 # 3.1 Full tensor (i.e. full expected shape) - ((2, 64, 1, 23), (23, 1), None, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 64, 64, 1), (1, 64), None, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 - ((2, 64, 100, 100), (100, 100), None, True) , #27 # 4.3 Very large (thousands, 10s of thousands) + ((2, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #18 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #19 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #20 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), #21 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), #22 # 3.1 Full tensor (i.e. full expected shape) + ((1, 64, 3, 4), (4, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), #23 # 3.1 Full tensor (i.e. full expected shape) ! not working as described in docs + ((2, 64, 45, 17), (17, 45), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #24 # 3.1 Full tensor (i.e. full expected shape) + ((2, 64, 1, 23), (23, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #25 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 64, 64, 1), (1, 64), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #26 # 3.2 Tensor reduce on one or more dims to 1 + ((2, 64, 100, 100), (100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #27 # 4.3 Very large (thousands, 10s of thousands) # "Fatal python error - xfail does not work. 
Error message: Fatal Python error: Segmentation fault; UserWarning: resource_tracker: There appear to be 26 leaked semaphore objects to clean up at shutdown" - pytest.param((2, 64, 1000, 100), (100, 1000), None, True, marks=pytest.mark.skip(reason=FailingReasons.SEMAPHORE_LEAK)), # 4.3 Very large (thousands, 10s of thousands) - ((2, 64, 10, 1000), (1000, 10), None, True) , #29 # 4.4 Extreme ratios between height/width - ((2, 64, 9920, 1), (1, 9920), None, True) , #30 # 4.4 Extreme ratios between height/width - ((2, 64, 10000, 1), (1, 10000), None, True) , #31 # 4.4 Extreme ratios between height/width - ((2, 64, 32, 64), (64, 32), None, True) , #32 # 4.1 Divisible by 32 - ((2, 64, 160, 96), (96, 160), None, True) , #33 # 4.1 Divisible by 32 - ((2, 64, 17, 41), (41, 17), None, True) , #34 # 4.2 Prime numbers - ((2, 64, 89, 3), (3, 89), None, True) , #35 # 4.2 Prime numbers + pytest.param((2, 64, 1000, 100), (100, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True, marks=pytest.mark.skip(reason=FailingReasons.SEMAPHORE_LEAK)), # 4.3 Very large (thousands, 10s of thousands) + ((2, 64, 10, 1000), (1000, 10), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #29 # 4.4 Extreme ratios between height/width + ((2, 64, 9920, 1), (1, 9920), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #30 # 4.4 Extreme ratios between height/width + ((2, 64, 10000, 1), (1, 10000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #31 # 4.4 Extreme ratios between height/width + ((2, 64, 32, 64), (64, 32), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #32 # 4.1 Divisible by 32 + ((2, 64, 160, 96), (96, 160), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #33 # 4.1 Divisible by 32 + ((2, 64, 17, 41), (41, 17), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #34 # 4.2 Prime numbers + ((2, 64, 89, 3), (3, 89), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True) , #35 # 4.2 Prime numbers ] -@pytest.mark.parametrize("input_shape_dense, input_shape_sparse, default_dram_params, prologue", get_input_shapes_prologued()) +@pytest.mark.parametrize("input_shape_dense, input_shape_sparse, input_source_flag, prologue", get_input_shapes_prologued()) def test_smm_operand_src_from_const_inputs_prologue( input_shape_dense, input_shape_sparse, - default_dram_params, + input_source_flag, prologue, test_device, - input_params=[], + dev_data_format=None, math_fidelity=None ): class Model(PyBudaModule): @@ -441,21 +440,16 @@ def forward(self, x): mod = Model("test_sparse_matmul_operand_src_from_const_inputs_prologue", input_shape_sparse) input_shapes = tuple([input_shape_dense]) - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = False - compiler_cfg.default_dram_parameters = default_dram_params - if (math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, + + verify( + test_device=test_device, + model=mod, input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + netlist = NetlistValidation() d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/lc.input_tensor.smm1.0") if prologue: @@ -477,9 +471,10 @@ def get_input_shape_sparse(micro_batch_size=1): def get_input_shape_dense(micro_batch_size=1): return (micro_batch_size, 
64, 3, 4) -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Float16_b}, - ] +dev_data_formats=[ + pybuda.DataFormat.Float16_b, +] + compiler_math_fidelity = [ pybuda.MathFidelity.LoFi, pybuda.MathFidelity.HiFi2, @@ -487,76 +482,90 @@ def get_input_shape_dense(micro_batch_size=1): pybuda.MathFidelity.HiFi4, ] +@pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_smm_mf_inputs_from_host(test_device, math_fidelity): - test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +def test_smm_mf_inputs_from_host(test_device, dev_data_format, math_fidelity): + test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_smm_mf_inputs_from_dram(test_device, math_fidelity): -# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +# def test_smm_mf_inputs_from_dram(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_smm_mf_inputs_from_const_inputs_const_eval(test_device, math_fidelity): -# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +# def test_smm_mf_inputs_from_const_inputs_const_eval(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_smm_mf_inputs_from_another_op(test_device, math_fidelity): -# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +# def test_smm_mf_inputs_from_another_op(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_smm_mf_inputs_from_tm_edge1(test_device, math_fidelity): -# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +# def test_smm_mf_inputs_from_tm_edge1(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) # @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -# def test_smm_mf_inputs_from_tm_edge2(test_device, math_fidelity): -# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, verify_input_params, math_fidelity) +# def test_smm_mf_inputs_from_tm_edge2(test_device, 
dev_data_format, math_fidelity): +# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) ## 2. -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Bfp2}, - {"dev_data_format": pybuda.DataFormat.Bfp2_b}, - {"dev_data_format": pybuda.DataFormat.Bfp4}, - {"dev_data_format": pybuda.DataFormat.Bfp4_b}, - {"dev_data_format": pybuda.DataFormat.Bfp8}, - {"dev_data_format": pybuda.DataFormat.Bfp8_b}, - {"dev_data_format": pybuda.DataFormat.Float16}, - {"dev_data_format": pybuda.DataFormat.Float16_b}, - {"dev_data_format": pybuda.DataFormat.Float32}, - {"dev_data_format": pybuda.DataFormat.Int8}, - {"dev_data_format": pybuda.DataFormat.Lf8}, - {"dev_data_format": pybuda.DataFormat.RawUInt16}, - {"dev_data_format": pybuda.DataFormat.RawUInt32}, - {"dev_data_format": pybuda.DataFormat.RawUInt8}, - {"dev_data_format": pybuda.DataFormat.UInt16}, - ] - -compiler_math_fidelity = pybuda.MathFidelity.HiFi4 - -@pytest.mark.parametrize("input_params", verify_input_params) -def test_smm_df_inputs_from_host(test_device, input_params): - test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) - -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_smm_df_inputs_from_dram(test_device, input_params): -# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) - -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_smm_df_inputs_from_const_inputs_const_eval(test_device, input_params): -# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) - -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_smm_df_inputs_from_another_op(test_device, input_params): -# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) - -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_smm_df_inputs_from_tm_edge1(test_device, input_params): -# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) - -# @pytest.mark.parametrize("input_params", verify_input_params) -# def test_smm_df_inputs_from_tm_edge2(test_device, input_params): -# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, input_params, compiler_math_fidelity) +dev_data_formats = [ + pybuda.DataFormat.Bfp2, + pybuda.DataFormat.Bfp2_b, + pybuda.DataFormat.Bfp4, + pybuda.DataFormat.Bfp4_b, + pybuda.DataFormat.Bfp8, + pybuda.DataFormat.Bfp8_b, + pybuda.DataFormat.Float16, + pybuda.DataFormat.Float16_b, + pybuda.DataFormat.Float32, + pybuda.DataFormat.Int8, + pybuda.DataFormat.Lf8, + pybuda.DataFormat.RawUInt16, + pybuda.DataFormat.RawUInt32, + pybuda.DataFormat.RawUInt8, + pybuda.DataFormat.UInt16, +] + +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] + +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_smm_df_inputs_from_host(test_device, dev_data_format, math_fidelity): + test_smm_operand_src_from_host(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", 
dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_df_inputs_from_dram(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_dram(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_df_inputs_from_const_inputs_const_eval(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_const_inputs_const_eval(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_df_inputs_from_another_op(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_another_op(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_df_inputs_from_tm_edge1(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_tm_edge1(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) + +# @pytest.mark.parametrize("dev_data_format", dev_data_formats) +# @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +# def test_smm_df_inputs_from_tm_edge2(test_device, dev_data_format, math_fidelity): +# test_smm_operand_src_from_tm_edge2(get_input_shape_dense(), get_input_shape_sparse(), test_device, dev_data_format, math_fidelity) diff --git a/pybuda/test/operators/nary/test_concatenate.py b/pybuda/test/operators/nary/test_concatenate.py index 71cdd1d5..2cf212cc 100644 --- a/pybuda/test/operators/nary/test_concatenate.py +++ b/pybuda/test/operators/nary/test_concatenate.py @@ -60,11 +60,42 @@ import pybuda import torch -from pybuda import PyBudaModule, VerifyConfig -from pybuda.config import _get_global_compiler_config -from pybuda.verify import TestKind, verify_module +from typing import List, Dict +from loguru import logger + +from pybuda import PyBudaModule +from pybuda.op_repo import TensorShape +from test.operators.utils import InputSourceFlags, VerifyUtils +from test.operators.utils import ShapeUtils from test.operators.utils import NetlistValidation from test.operators.utils import FailingReasons +from test.conftest import TestDevice + + +def verify( + test_device: TestDevice, + model: PyBudaModule, + input_shape: TensorShape, + number_of_operands: int, + input_params: List[Dict] = [], + input_source_flag: InputSourceFlags = None, + dev_data_format: pybuda.DataFormat = None, + math_fidelity: pybuda.MathFidelity = None, +): + '''Common verification function for all tests''' + + input_shapes = tuple([input_shape for _ in range(number_of_operands)]) + logger.trace(f"***input_shapes: {input_shapes}") + + VerifyUtils.verify( + model=model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, + ) # Concatenate operator doesn't work when axis is equal to 0.
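The hunks above and below this point all route through the new shared verify() helper, passing an InputSourceFlags value instead of mutating the global compiler config by hand the way the deleted code did. The flags implementation lives in test/operators/utils and is not part of this diff, so the sketch below is only a plausible reconstruction of the idea: the pairings are inferred from the before/after parametrize lists in these patches (default_dram_parameters=True lined up with the not-prologued cases, False with prologued, None with prologue-by-microbatch-size), and InputSourceParams / apply_input_source_flag are hypothetical names, not the real utils API.

from dataclasses import dataclass
from enum import Enum
from typing import Optional

import pybuda


@dataclass(frozen=True, eq=False)  # eq=False keeps members distinct so Enum does not alias them
class InputSourceParams:
    input_queues_on_host: bool
    default_dram_parameters: Optional[bool]


class InputSourceFlags(Enum):
    FROM_HOST = InputSourceParams(True, None)
    FROM_DRAM = InputSourceParams(False, None)
    FROM_DRAM_NOT_PROLOGUED = InputSourceParams(False, True)
    FROM_DRAM_PROLOGUED = InputSourceParams(False, False)
    FROM_DRAM_PROLOGUE_MICROBATCH_SIZE = InputSourceParams(False, None)


def apply_input_source_flag(flag: InputSourceFlags):
    # Hypothetical helper: mirrors the two assignments the deleted test code
    # made inline before calling verify_module().
    compiler_cfg = pybuda.config._get_global_compiler_config()
    compiler_cfg.input_queues_on_host = flag.value.input_queues_on_host
    compiler_cfg.default_dram_parameters = flag.value.default_dram_parameters

Bundling both fields behind one flag keeps every test signature down to a single input-source argument and lets the expected prologue behaviour sit right next to the flag that produces it in the parametrize lists.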
@@ -92,7 +123,7 @@ @pytest.mark.xfail(reason=FailingReasons.UNSUPORTED_AXIS) @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", input_shapes) -def test_concatenate_invalid_axis(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_invalid_axis(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -103,23 +134,17 @@ def forward(self, x, y): return output mod = Model("test_concatenate_invalid_axis_model") - input_shapes = tuple([input_shape for _ in range(2)]) - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + # setup of axises and shapes for all tests: axises = [-3, -2, -1, 1, 2] @@ -143,7 +168,7 @@ def get_input_shapes(microbatch_size=1): # 2.1 From another op @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_another_operand(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_another_operand(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -157,21 +182,14 @@ def forward(self, x, y): return output mod = Model("test_concatenate_inputs_from_another_operand_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -179,7 +197,7 @@ def forward(self, x, y): # - Combination: operator -> tm -> input @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_tm_edge1(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_tm_edge1(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -192,21 +210,14 @@ def forward(self, x, y): return v3 mod = Model("test_concatenate_inputs_from_tm_edge1_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ 
-214,7 +225,7 @@ def forward(self, x, y): # - tm -> input @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_tm_edge2(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_tm_edge2(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -227,21 +238,14 @@ def forward(self, x, y): return v3 mod = Model("test_concatenate_inputs_from_tm_edge2_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -249,7 +253,7 @@ def forward(self, x, y): # - input_queue flag = false @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_dram_queue(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_dram_queue(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -259,23 +263,17 @@ def forward(self, x, y): return pybuda.op.Concatenate("Concatenate0", x, y, axis=axis) mod = Model("test_concatenate_inputs_from_dram_queue_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.input_queues_on_host = False - if(math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + input_source_flag=InputSourceFlags.FROM_DRAM, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + netlist = NetlistValidation() assert netlist.get_value("/queues/x/loc") == 'dram' assert netlist.get_value("/queues/y/loc") == 'dram' @@ -292,32 +290,32 @@ def forward(self, x, y): pytest.param(1), pytest.param(2) ]) -@pytest.mark.parametrize("input_shape, default_dram_params, should_prolog", [ - pytest.param((2, 3, 3), True, False), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] - pytest.param((2, 3, 3), False, True), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] - pytest.param((2, 3, 3), None, True), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] - pytest.param((1, 3, 3), True, False), # 3.1 Full tensor (i.e. full expected shape) - PASS - pytest.param((1, 3, 3), False, True), # 3.1 Full tensor (i.e. full expected shape) - PASS - pytest.param((1, 3, 3), None, True), # 3.1 Full tensor (i.e. full expected shape) - PASS - but not according to documentation! - pytest.param((2, 10, 5), None, True), # 3.1 Full tensor (i.e. 
full expected shape) - FAILING FOR axis=[-3] - pytest.param((2, 1, 15), None, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] - pytest.param((2, 50, 1), None, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] - pytest.param((2, 100, 100), None, True), # 4.3 Very large (thousands, 10s of thousands) - FAILING FOR axis=[-3] +@pytest.mark.parametrize("input_shape, input_source_flag, should_prolog", [ + pytest.param((2, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] + pytest.param((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] + pytest.param((2, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] + pytest.param((1, 3, 3), InputSourceFlags.FROM_DRAM_NOT_PROLOGUED, False), # 3.1 Full tensor (i.e. full expected shape) - PASS + pytest.param((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUED, True), # 3.1 Full tensor (i.e. full expected shape) - PASS + pytest.param((1, 3, 3), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 3.1 Full tensor (i.e. full expected shape) - PASS - but not according to documentation! + pytest.param((2, 10, 5), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 3.1 Full tensor (i.e. full expected shape) - FAILING FOR axis=[-3] + pytest.param((2, 1, 15), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] + pytest.param((2, 50, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 3.2 Tensor reduce on one or more dims to 1 - FAILING FOR axis=[-3] + pytest.param((2, 100, 100), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 4.3 Very large (thousands, 10s of thousands) - FAILING FOR axis=[-3] # FAILING FOR axis=[-3, -1, 2] - pytest.param((2, 100, 1000), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.3 Very large (thousands, 10s of thousands) + pytest.param((2, 100, 1000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.3 Very large (thousands, 10s of thousands) # FAILING FOR all axes - pytest.param((2, 1, 4991), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + pytest.param((2, 1, 4991), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width # FAILING FOR axis=[-3, -1, 2] - pytest.param((2, 1, 10000), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + pytest.param((2, 1, 10000), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width # FAILING FOR all axes - pytest.param((2, 8191, 1), None, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + pytest.param((2, 8191, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width # FAILING FOR axis=[-3, -1, 2] - pytest.param((2, 10000, 1), None, False,
marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width - pytest.param((2, 32, 32), None, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] - pytest.param((2, 96, 96), None, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] - pytest.param((2, 13, 97), None, True), # 4.2 Prime numbers - FAILING FOR axis=[-3] + pytest.param((2, 10000, 1), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, False, marks=pytest.mark.xfail(reason=FailingReasons.BUGGY_SHAPE)), # 4.4 Extreme ratios between height/width + pytest.param((2, 32, 32), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] + pytest.param((2, 96, 96), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 4.1 Divisible by 32 - FAILING FOR axis=[-3] + pytest.param((2, 13, 97), InputSourceFlags.FROM_DRAM_PROLOGUE_MICROBATCH_SIZE, True), # 4.2 Prime numbers - FAILING FOR axis=[-3] ]) -def test_concatenate_inputs_from_dram_prologued(test_device, axis, input_shape, default_dram_params, should_prolog, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_dram_prologued(test_device, axis, input_shape, input_source_flag, should_prolog, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -326,8 +324,7 @@ def __init__(self, name): def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - t = input_shape[1:] - self.shape_input = (1, *t) + self.shape_input = ShapeUtils.reduce_microbatch_size(input_shape) self.add_constant("c") self.set_constant("c", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) @@ -338,22 +335,16 @@ def forward(self, x): mod = Model("test_concatenate_inputs_from_dram_prologued_model") - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_dram_parameters = default_dram_params - compiler_cfg.input_queues_on_host = False - if(math_fidelity is not None): - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=[input_shape], - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=1, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + netlist = NetlistValidation() d = netlist.get_value("/programs/0/run_fwd_0/4/execute/queue_settings/input_0_Concatenate0") if should_prolog: @@ -365,7 +356,7 @@ def forward(self, x): # 2.5 Const Inputs (const eval pass) @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_constants(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_constants(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -374,17 +365,13 @@ def __init__(self, name): def my_rand(*shape, requires_grad=False): return (torch.rand(*shape, requires_grad=requires_grad) - 0.5).detach() - self.shape_input = input_shape + self.shape_input = ShapeUtils.reduce_microbatch_size(input_shape) self.add_constant("c1") self.set_constant("c1", pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) self.add_constant("c2") self.set_constant("c2", 
pybuda.Tensor.create_from_torch(my_rand(*self.shape_input), constant=True)) - - self.inputs = [ - pybuda.Tensor.create_from_torch(my_rand(*self.shape_input)) - ] def forward(self, x, y): v1 = pybuda.op.Concatenate("Concatenate0", self.get_constant("c1"), self.get_constant("c2"), axis=axis) @@ -402,22 +389,16 @@ def forward(self, x, y): i_shape = list(input_shape) i_shape[axis] = 2 * i_shape[axis] i_shape = tuple(i_shape) - input_shapes = tuple([i_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=i_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) + # Here we check there is no key with "Concatenate" in the netlist in graphs section netlist = NetlistValidation() d = netlist.get_value("/graphs/fwd_0_0_temporal_epoch_0") @@ -428,7 +409,7 @@ def forward(self, x, y): # 2.6 From host - case of two tensors as input @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) -def test_concatenate_inputs_from_host_2(test_device, axis, input_shape, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_host_2(test_device, axis, input_shape, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -438,21 +419,14 @@ def forward(self, x, y): return pybuda.op.Concatenate("Concatenate0", x, y, axis=axis) mod = Model("test_concatenate_inputs_from_host_2_model") - input_shapes = tuple([input_shape for _ in range(2)]) - - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=2, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) number_of_operands = [ @@ -480,7 +454,7 @@ def forward(self, x, y): @pytest.mark.parametrize("axis", axises) @pytest.mark.parametrize("input_shape", get_input_shapes(microbatch_size=1)) @pytest.mark.parametrize("number_of_operands", number_of_operands) -def test_concatenate_inputs_from_host_multiple_operands(test_device, axis, input_shape, number_of_operands, input_params=[], math_fidelity=None): +def test_concatenate_inputs_from_host_multiple_operands(test_device, axis, input_shape, number_of_operands, dev_data_format=None, math_fidelity=None): class Model(PyBudaModule): def __init__(self, name): @@ -490,21 +464,14 @@ def forward(self, *x): return pybuda.op.Concatenate("Concatenate0", *x, axis=axis) mod = Model("test_concatenate_inputs_from_host_multiple_operands") - input_shapes = tuple([input_shape for _ in range(number_of_operands)]) - if(math_fidelity is not None): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.default_math_fidelity = math_fidelity - - verify_module( - mod, - input_shapes=input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ), - 
input_params=[input_params], + verify( + test_device=test_device, + model=mod, + input_shape=input_shape, + number_of_operands=number_of_operands, + dev_data_format=dev_data_format, + math_fidelity=math_fidelity, ) @@ -523,9 +490,9 @@ def get_single_shape(microbatch_size=1): ### 1. #################################################################################### # 5.4 Operand DFs -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Float16_b}, - ] +dev_data_formats=[ + pybuda.DataFormat.Float16_b, +] # 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 compiler_math_fidelity = [ @@ -536,36 +503,40 @@ def get_single_shape(microbatch_size=1): ] +@pytest.mark.parametrize("dev_data_format", dev_data_formats) @pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) -def test_concatenate_mf_inputs_from_another_operand(test_device, math_fidelity): - test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), verify_input_params, math_fidelity) +def test_concatenate_mf_inputs_from_another_operand(test_device, dev_data_format, math_fidelity): + test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) ### 2. #################################################################################### # 5.4 Operand DFs -verify_input_params=[ - {"dev_data_format": pybuda.DataFormat.Bfp2}, - {"dev_data_format": pybuda.DataFormat.Bfp2_b}, - {"dev_data_format": pybuda.DataFormat.Bfp4}, - {"dev_data_format": pybuda.DataFormat.Bfp4_b}, - {"dev_data_format": pybuda.DataFormat.Bfp8}, - {"dev_data_format": pybuda.DataFormat.Bfp8_b}, - {"dev_data_format": pybuda.DataFormat.Float16}, - {"dev_data_format": pybuda.DataFormat.Float16_b}, - {"dev_data_format": pybuda.DataFormat.Float32}, - {"dev_data_format": pybuda.DataFormat.Int8}, - {"dev_data_format": pybuda.DataFormat.Lf8}, - {"dev_data_format": pybuda.DataFormat.RawUInt16}, - {"dev_data_format": pybuda.DataFormat.RawUInt32}, - {"dev_data_format": pybuda.DataFormat.RawUInt8}, - {"dev_data_format": pybuda.DataFormat.UInt16}, - ] +dev_data_formats=[ + pybuda.DataFormat.Bfp2, + pybuda.DataFormat.Bfp2_b, + pybuda.DataFormat.Bfp4, + pybuda.DataFormat.Bfp4_b, + pybuda.DataFormat.Bfp8, + pybuda.DataFormat.Bfp8_b, + pybuda.DataFormat.Float16, + pybuda.DataFormat.Float16_b, + pybuda.DataFormat.Float32, + pybuda.DataFormat.Int8, + pybuda.DataFormat.Lf8, + pybuda.DataFormat.RawUInt16, + pybuda.DataFormat.RawUInt32, + pybuda.DataFormat.RawUInt8, + pybuda.DataFormat.UInt16, +] # 6. 
Math fidelity -compiler_math_fidelity = pybuda.MathFidelity.HiFi4 +compiler_math_fidelity = [ + pybuda.MathFidelity.HiFi4, +] -@pytest.mark.parametrize("input_params", verify_input_params) -def test_concatenate_df_inputs_from_another_operand(test_device, input_params): - test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), input_params, compiler_math_fidelity) +@pytest.mark.parametrize("dev_data_format", dev_data_formats) +@pytest.mark.parametrize("math_fidelity", compiler_math_fidelity) +def test_concatenate_df_inputs_from_another_operand(test_device, dev_data_format, math_fidelity): + test_concatenate_inputs_from_another_operand(test_device, axis, get_single_shape(), dev_data_format, math_fidelity) From 5afa0d98f5ee3a19565290745505fba3b8c53e61 Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 22 Aug 2024 18:13:47 +0000 Subject: [PATCH 098/116] Add test for microsoft/phi-3-mini-4k-instruct (pytorch) (cherry picked from commit 1263212b0a8195c42748253bd554bb7ded32864b) --- .../high_prio/nlp/pytorch/test_phi3.py | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py new file mode 100644 index 00000000..121ed003 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -0,0 +1,120 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice +from transformers import Phi3Config, Phi3ForCausalLM, AutoTokenizer, Phi3ForTokenClassification +import os +import torch +import pytest + +# Masked fill kernel produced invalid results in Silicon BackendType +# So disabling the verification in BBE for Silicon BackendType +# Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712 + +variants = ["microsoft/phi-3-mini-4k-instruct"] + + +@pytest.mark.parametrize("variant", variants) +def test_phi3_causal_lm(test_device, variant): + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "0" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "15360" + + elif test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1" + os.environ["PYBUDA_DRAM_FLIP_FLOP"] = "1" + + # Phi3Config from pretrained variant, disable return_dict and caching.
+ config = Phi3Config.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = Phi3Config(**config_dict) + + # Load tokenizer and model from HuggingFace + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + model = Phi3ForCausalLM.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + + # input_prompt + input_prompt = "Africa is an emerging economy because" + + # Tokenize input + inputs = tokenizer( + input_prompt, + return_tensors="pt", + max_length=256, + pad_to_max_length=True, + truncation=True, + ) + + input_ids = inputs["input_ids"].to(torch.int32) + attn_mask = inputs["attention_mask"].to(torch.float32) + + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + verify_module( + tt_model, + input_shapes=[input_ids.shape, attn_mask.shape], + inputs=[input_ids, attn_mask], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, + ), + ) + + +@pytest.mark.parametrize("variant", variants) +def test_phi3_token_classification(test_device, variant): + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + # Phi3Config from pretrained variant, disable return_dict and caching. + config = Phi3Config.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = Phi3Config(**config_dict) + + # Load tokenizer and model from HuggingFace + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + + model = Phi3ForTokenClassification.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + + # input_prompt + input_prompt = "HuggingFace is a company based in Paris and New York" + + # Tokenize input + inputs = tokenizer(input_prompt, return_tensors="pt") + + input_ids = inputs["input_ids"] + + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + verify_module( + tt_model, + input_shapes=[input_ids.shape], + inputs=[input_ids], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) From 07a74166e615981dce87dabe42a3a620acfab33c Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Thu, 22 Aug 2024 11:36:08 +0000 Subject: [PATCH 099/116] Avoid collecting TestKind and TestDevice Avoid collecting TestKind and TestDevice as pytest tests (cherry picked from commit 0a68e2aef2afb405dfdf79a481fded857ed65c78) --- pybuda/pybuda/verify/config.py | 2 ++ pybuda/test/conftest.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pybuda/pybuda/verify/config.py b/pybuda/pybuda/verify/config.py index 108e412e..b9b2b674 100644 --- a/pybuda/pybuda/verify/config.py +++ b/pybuda/pybuda/verify/config.py @@ -16,6 +16,8 @@ class TestKind(Enum): + __test__ = False # Avoid collecting TestKind as a pytest test + INFERENCE = 1 TRAINING = 2 TRAINING_RECOMPUTE = 3 diff --git a/pybuda/test/conftest.py b/pybuda/test/conftest.py index 06371506..b8e94619 100644 --- a/pybuda/test/conftest.py +++ 
b/pybuda/test/conftest.py @@ -179,6 +179,8 @@ def no_skip(*args, **kwargs): @dataclass class TestDevice: + __test__ = False # Avoid collecting TestDevice as a pytest test + devtype: BackendType arch: BackendDevice devmode: DeviceMode From 87a00c3f9e8ba259482d396422fedba647eb656f Mon Sep 17 00:00:00 2001 From: Vladimir Brkic Date: Thu, 22 Aug 2024 10:33:38 +0000 Subject: [PATCH 100/116] Revert "Fix failing jobs in pipeline" This reverts commit 47f97740ad5488e6dcf18c5d1e3cff18486206ae. (cherry picked from commit 08da67674a6384290623634d47f86429e442b95b) --- pybuda/test/random/rgg/frameworks.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pybuda/test/random/rgg/frameworks.py b/pybuda/test/random/rgg/frameworks.py index 0e5b3e23..d8bbbbcb 100644 --- a/pybuda/test/random/rgg/frameworks.py +++ b/pybuda/test/random/rgg/frameworks.py @@ -69,22 +69,22 @@ def set_calc_input_shapes(cls, framework: Framework, allow_operators: Tuple[str] operator.calc_input_shapes = OperatorShapes.same_input_shapes -def build_framework(framework_name: str, ModelBuilderType: Type[ModelBuilder], operator_repository: OperatorRepository): - framework = Framework( - framework_name=framework_name, - ModelBuilderType=ModelBuilderType, - operator_repository=operator_repository, - ) - - framework = FrameworkTestUtils.copy_framework(framework=framework, skip_operators=()) +class Frameworks(Enum): + ''' Register of all frameworks ''' - FrameworkTestUtils.set_calc_input_shapes(framework) + @staticmethod + def build_framework(framework_name: str, ModelBuilderType: Type[ModelBuilder], operator_repository: OperatorRepository): + framework = Framework( + framework_name=framework_name, + ModelBuilderType=ModelBuilderType, + operator_repository=operator_repository, + ) - return framework + framework = FrameworkTestUtils.copy_framework(framework=framework, skip_operators=()) + FrameworkTestUtils.set_calc_input_shapes(framework) -class Frameworks(Enum): - ''' Register of all frameworks ''' + return framework PYBUDA = build_framework( framework_name="PyBuda", From 06fb006aa37f5fc14f4a0e13a94f5e065bdb18df Mon Sep 17 00:00:00 2001 From: chandrasekaranpradeep Date: Mon, 26 Aug 2024 07:55:56 -0400 Subject: [PATCH 101/116] Fix openpose osmr body performance issue (cherry picked from commit 240aba33a36e5f25bf91075db6ea8d568b389b84) --- pybuda/test/benchmark/benchmark/models/openpose_body.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pybuda/test/benchmark/benchmark/models/openpose_body.py b/pybuda/test/benchmark/benchmark/models/openpose_body.py index 03d7d401..b8bd882c 100644 --- a/pybuda/test/benchmark/benchmark/models/openpose_body.py +++ b/pybuda/test/benchmark/benchmark/models/openpose_body.py @@ -28,6 +28,8 @@ def openpose_osmr_body(training: bool, config: str, microbatch: int, devtype: st os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "13" os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + if config == "2d" and arch == "wormhole_b0" and data_type == "Fp16" and math_fidelity == "HiFi3": + os.environ["PYBUDA_DISABLE_ELU_HANDLE_INF"] = "1" # Set model parameters based on chosen task and model configuration model_name = "" From caaa16454821d30bdc94aae81323e424c12b6a1e Mon Sep 17 00:00:00 2001 From: pchandrasekaran Date: Mon, 26 Aug 2024 08:25:03 +0000 Subject: [PATCH 102/116] Add BTS densenet161 and densenet121 model on wh_b0 (cherry picked from commit 5be550d44e120b11d0fb16d09a32f17f3ce30d3e) --- pybuda/csrc/buda_passes.cpp | 1 - pybuda/csrc/passes/limit_to_4d_reshape.cpp | 108 
------------------ pybuda/csrc/passes/limit_to_4d_reshape.hpp | 1 - .../high_prio/cnn/pytorch/test_bts.py | 100 ++++++++++++++++ 4 files changed, 100 insertions(+), 110 deletions(-) create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 033111ea..445af0ff 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -115,7 +115,6 @@ run_post_initial_graph_passes(graphlib::Graph *graph, py::object compiler_cfg_ob passes::fork_quantization_scales(graph); passes::remove_quant_dequant(graph); reportify::dump_graph(graph->name(), "post_quantize_commute", graph); - passes::decompose_nd_reshape_split(graph); passes::limit_to_4d_reshape(graph); passes::erase_unnecessary_4d_tm_sequence(graph); passes::fuse_pad_conv2d(graph); diff --git a/pybuda/csrc/passes/limit_to_4d_reshape.cpp b/pybuda/csrc/passes/limit_to_4d_reshape.cpp index de204f41..64d9433e 100644 --- a/pybuda/csrc/passes/limit_to_4d_reshape.cpp +++ b/pybuda/csrc/passes/limit_to_4d_reshape.cpp @@ -13,7 +13,6 @@ namespace tt::passes { -using Attr = BudaOpAttr; static bool is_reshape(graphlib::Node const *node) { @@ -122,111 +121,4 @@ void limit_to_4d_reshape(graphlib::Graph *graph) - -template -bool all_have_same_dim_and_shape_stride1(std::vector const &v) { - if (v.size() == 0) { - return false; - } - - return std::all_of(v.begin(), v.end(), [&] (T const &e) { - auto attrs = dynamic_cast(e)->op_attrs(); - int dim = std::get(attrs[0]); - return dim == std::get(dynamic_cast(v.front())->op_attrs()[0]) - and e->shape() == v.front()->shape() - and std::get(attrs[3]) == 1; - }); -} - -void decompose_nd_reshape_split(graphlib::Graph *graph) { - - for (auto node : graph->nodes()) - { - // Only consider reshape nodes with last dimension tile_dim aligned - if (not is_reshape(node) or node->shape()[-1] % graphlib::Shape::BUDA_TILE_DIM != 0) - continue; - - auto consumers = graph->users(node); - bool all_consumers_are_index = all_of(consumers.begin(), consumers.end(), [](auto const &consumer) { - auto op = dynamic_cast(consumer); - return op and op->op_name() == "index"; - }); - - // All consumers must be index - if (not all_consumers_are_index) - continue; - - bool all_index_have_same_dim_and_shape = all_have_same_dim_and_shape_stride1(consumers); - - // All index must have same dim and shape - if (not all_index_have_same_dim_and_shape) - continue; - - uint32_t total_index_size = 0; - int dim = std::get(dynamic_cast(consumers[0])->op_attrs()[0]); - - for (auto const &consumer : consumers) { - total_index_size += consumer->shape()[dim]; - } - bool index_all_channels = total_index_size == node->shape()[dim]; - bool all_index_have_length1 = all_of(consumers.begin(), consumers.end(), [](auto const &consumer) { - auto op = dynamic_cast(consumer); - return std::get(op->op_attrs()[2]) - std::get(op->op_attrs()[1]) == 1; - }); - - // All index must have length 1 and total indexed size must be equal to node dim - if (not (index_all_channels and all_index_have_length1)) - continue; - - - bool all_sequence_consumers_are_squeeze = all_of(consumers.begin(), consumers.end(), [graph] (auto const &consumer) { - auto users = graph->users(consumer); - auto shape_before = consumer->shape(); - auto shape_after = users[0]->shape(); - auto op = dynamic_cast(users[0]); - return users.size() == 1 and op->op_name() == "reshape" and shape_after.volume() == shape_before.volume() - and shape_after.size() + 1 == shape_before.size(); - }); - - // All consumers of 
Index must be reshape nodes that are equivalent to squeeze - if (not all_sequence_consumers_are_squeeze) - continue; - - auto reshape_producer = graph->operands(node)[0]; - - // Remove reshape node and update index nodes to select - int new_dim = dim + 1; - int producer_dim_size = reshape_producer->shape()[new_dim]; - TT_ASSERT(producer_dim_size % total_index_size == 0); - int new_dim_size = producer_dim_size / total_index_size; - - for (uint32_t i = 0; i < consumers.size(); i++) { - auto op = dynamic_cast(consumers[i]); - - auto op_type_ = op->op_type(); - TT_ASSERT(op_type_.op == "index"); - auto op_attrs = op->op_attrs(); - op_type_.op = "select"; - - std::vector new_op_attrs(4); - new_op_attrs[0] = new_dim; - new_op_attrs[1] = (int) (std::get(op_attrs[1]) * node->shape()[-1]); - new_op_attrs[2] = (int) node->shape()[-1]; - new_op_attrs[3] = (int) (consumers.size() * node->shape()[-1]); - op_type_.attr = new_op_attrs; - op->change_op_type(op_type_); - - auto producer_shape = reshape_producer->shape().as_vector(); - auto target_shape = graphlib::Shape::create(producer_shape); - target_shape[new_dim] = new_dim_size; - op->set_shape(target_shape); - } - - graphlib::bypass_node(graph, node, true); - - // Update node shapes in graph - recalculate_shapes(graph); - } -} - } // namespace tt::passes diff --git a/pybuda/csrc/passes/limit_to_4d_reshape.hpp b/pybuda/csrc/passes/limit_to_4d_reshape.hpp index 157eb26e..53205f67 100644 --- a/pybuda/csrc/passes/limit_to_4d_reshape.hpp +++ b/pybuda/csrc/passes/limit_to_4d_reshape.hpp @@ -11,5 +11,4 @@ class Graph; namespace tt::passes { void limit_to_4d_reshape(graphlib::Graph *graph); -void decompose_nd_reshape_split(graphlib::Graph *graph); } diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py new file mode 100644 index 00000000..8209d16d --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py @@ -0,0 +1,100 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + +import torch +from torchvision import transforms +import numpy as np + +from PIL import Image +import pytest +import os +import sys + +sys.path = list( + set(sys.path + ["third_party/confidential_customer_models/internal/bts/"]) +) + +from scripts.model import get_bts_model + + +variants = ["densenet161_bts", "densenet121_bts"] + + +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_bts_pytorch(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if variant == "densenet161_bts": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "94564" + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.balancer_op_override("multiply_196", "t_stream_shape", (1, 1)) + + elif variant == "densenet121_bts": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "76356" + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + compiler_cfg.enable_auto_fusing = False + + # Load sample image + image_path = "third_party/confidential_customer_models/internal/bts/files/samples/rgb_00315.jpg" + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], 
std=[0.229, 0.224, 0.225] + ) + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + image = torch.from_numpy(image.transpose((2, 0, 1))) + image = normalize(image) + image = torch.unsqueeze(image, 0) + + # Get the model + model = get_bts_model(variant) + checkpoint = torch.load( + "third_party/confidential_customer_models/internal/bts/files/weights/nyu/" + + str(variant) + + "/" + + str(variant) + + ".pt", + map_location=torch.device("cpu"), + ) + model.load_state_dict(checkpoint) + model.eval() + + class BtsModel_wrapper(torch.nn.Module): + def __init__(self, model, focal): + super().__init__() + self.model = model + self.focal = focal + + def forward(self, input_tensor): + return self.model(input_tensor, self.focal) + + bts_model_wrapper = BtsModel_wrapper(model, focal=518.8579) + bts_model_wrapper.eval() + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule("pt_" + str(variant), bts_model_wrapper) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(image.shape,)], + inputs=[(image,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + enabled=( + False if test_device.devtype == pybuda.BackendType.Silicon else True + ), + ), + ) From 8b76023641cf72f62c472c40d95d960f6326111a Mon Sep 17 00:00:00 2001 From: Kamalraj Kannan Date: Tue, 27 Aug 2024 16:52:00 +0000 Subject: [PATCH 103/116] Add test for sequence cls task for microsoft/phi-3-mini-4k-instruct - pytorch(wb0) (cherry picked from commit d599ec1031f8c4deee9bd4a857dc4ebe60f51ed4) --- .../high_prio/nlp/pytorch/test_phi3.py | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py index 121ed003..60e92be1 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -3,9 +3,8 @@ from pybuda import VerifyConfig from pybuda.verify.config import TestKind from pybuda._C.backend_api import BackendDevice -from transformers import Phi3Config, Phi3ForCausalLM, AutoTokenizer, Phi3ForTokenClassification +from transformers import Phi3Config, Phi3ForCausalLM, AutoTokenizer, Phi3ForTokenClassification, Phi3ForSequenceClassification import os -import torch import pytest # Masked fill kernal produced invalid results in Silicon BackendType @@ -57,8 +56,8 @@ def test_phi3_causal_lm(test_device, variant): truncation=True, ) - input_ids = inputs["input_ids"].to(torch.int32) - attn_mask = inputs["attention_mask"].to(torch.float32) + input_ids = inputs["input_ids"] + attn_mask = inputs["attention_mask"] tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) @@ -109,8 +108,52 @@ def test_phi3_token_classification(test_device, variant): verify_module( tt_model, - input_shapes=[input_ids.shape], - inputs=[input_ids], + input_shapes=[(input_ids.shape,)], + inputs=[(input_ids,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.parametrize("variant", variants) +def test_phi3_sequence_classification(test_device, variant): + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = 
pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + # Phi3Config from pretrained variant, disable return_dict and caching. + config = Phi3Config.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config_dict["pad_token_id"] = None + config = Phi3Config(**config_dict) + + # Load tokenizer and model from HuggingFace + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + model = Phi3ForSequenceClassification.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + + # input_prompt + input_prompt = "the movie was great!" + + # Tokenize input + inputs = tokenizer(input_prompt, return_tensors="pt") + + input_ids = inputs["input_ids"] + + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + verify_module( + tt_model, + input_shapes=[(input_ids.shape)], + inputs=[(input_ids)], verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, From a9c5b71daae8b499183358fe53eb970b181497f5 Mon Sep 17 00:00:00 2001 From: Guangyu Feng Date: Tue, 20 Aug 2024 20:51:43 +0000 Subject: [PATCH 104/116] Adjust epoch id after data parallel modifications This fixes a hang (#2827). (cherry picked from commit 56fc6881ceca7c026e9faef3135dfc973f1cdcfb) --- pybuda/csrc/lower_to_buda/netlist.cpp | 95 +++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 12 deletions(-) diff --git a/pybuda/csrc/lower_to_buda/netlist.cpp b/pybuda/csrc/lower_to_buda/netlist.cpp index 22742514..e3af5eb0 100644 --- a/pybuda/csrc/lower_to_buda/netlist.cpp +++ b/pybuda/csrc/lower_to_buda/netlist.cpp @@ -697,10 +697,10 @@ std::pair get_epoch_allocate_deallocate(graphlib::Node *q, const place } // Find out the updated epoch id after inserting empty epochs, only applies to n300 data parallel -size_t get_updated_epoch_id(size_t epoch_id, const vector& dp_epochs) +uint32_t get_updated_epoch_id(uint32_t epoch_id, const vector& dp_epochs) { - size_t num_of_insertions = 0; - for (size_t dp_epoch: dp_epochs) + uint32_t num_of_insertions = 0; + for (uint32_t dp_epoch: dp_epochs) { if (epoch_id > dp_epoch) num_of_insertions++; @@ -708,9 +708,76 @@ size_t get_updated_epoch_id(size_t epoch_id, const vector& dp_epochs) return epoch_id + num_of_insertions; } +// Modifications over PlacerSolution::temporal_epoch_id(string) +uint32_t get_dp_updated_temporal_epoch_id( + const std::string &node_name, const placer::PlacerSolution &placer_solution, + const vector &dp_epochs) +{ + uint32_t global_epoch_id = placer_solution.name_to_op_placement.at(node_name).epoch_id(); + uint32_t updated_global_epoch_id = get_updated_epoch_id(global_epoch_id, dp_epochs); + return placer_solution.epoch_id_to_epoch_info.at(updated_global_epoch_id).temporal_epoch_id; +} + +// Modifications over get_consumer_epoch_ids() +std::vector get_updated_consumer_epoch_ids( + const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution, + const vector &dp_epochs) +{ + std::vector consumer_epoch_ids; + std::vector users = graph->data_users(node); + try + { + for (Node *user : users) + { + uint32_t temporal_epoch_id = get_dp_updated_temporal_epoch_id(user->name(), placer_solution, dp_epochs); + consumer_epoch_ids.push_back(get_updated_epoch_id(temporal_epoch_id, dp_epochs)); + } + return consumer_epoch_ids; + } + catch (std::out_of_range &e) + { + log_fatal("Placement missing for a user of {}", node->name()); + return 
{}; + } +} + +// Modifications over get_last_epoch_use() +std::uint32_t get_updated_last_epoch_use( + const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution, + const vector &dp_epochs) +{ + std::vector consumer_epoch_ids = get_updated_consumer_epoch_ids(graph, node, placer_solution, dp_epochs); + return *std::max_element(consumer_epoch_ids.begin(), consumer_epoch_ids.end()); +} + +// Modifications over get_first_epoch_producer() +std::uint32_t get_updated_first_epoch_producer( + const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution, + const vector &dp_epochs) +{ + std::vector operands = graph->operands(node); + try + { + std::uint32_t min_epoch = get_dp_updated_temporal_epoch_id(operands[0]->name(), placer_solution, dp_epochs); + for (std::uint32_t i = 1; i < operands.size(); i++) + { + std::uint32_t epoch = get_dp_updated_temporal_epoch_id(operands[i]->name(), placer_solution, dp_epochs); + if (epoch < min_epoch) + min_epoch = epoch; + } + return min_epoch; + } + catch (std::out_of_range &e) + { + log_fatal("Placement missing for an operand of {}", node->name()); + return 0; + } +} + + std::vector create_programs( Graph *graph, placer::PlacerSolution &placer_solution, BudaGraph &buda_graph, const std::string &arch_string, - const vector &dp_epochs) + const vector &dp_epochs) { std::vector programs; @@ -1015,11 +1082,11 @@ std::vector create_programs( bool read_global; if (q->as()->is_output()) { - read_global = (temporal_epoch_id == get_first_epoch_producer(graph, q, placer_solution)); + read_global = (temporal_epoch_id == get_updated_first_epoch_producer(graph, q, placer_solution, dp_epochs)); } else { - read_global = (temporal_epoch_id == get_last_epoch_use(graph, q, placer_solution)); + read_global = (temporal_epoch_id == get_updated_last_epoch_use(graph, q, placer_solution, dp_epochs)); } auto [epoch_allocate, epoch_deallocate] = get_epoch_allocate_deallocate(q, placer_solution); @@ -1450,7 +1517,8 @@ static std::vector get_input_dram_io_buf_size_tiles( return input_dram_io_buf_size_tiles; } - const int pipegen_available_dram_io_space_per_stream = free_l1_space / num_dram_readers; // try /2 TODO + const int scale_for_n300_dataparallel = env_as("PYBUDA_N300_DATA_PARALLEL") ? 
2 : 1;
+    const int pipegen_available_dram_io_space_per_stream = free_l1_space / num_dram_readers / scale_for_n300_dataparallel;
     int current_stream_available_dram_io_space = pipegen_available_dram_io_space_per_stream;
 
     for (std::size_t input_idx = 0; input_idx < operands.size(); ++input_idx)
@@ -1686,9 +1754,9 @@ BudaNetlist lower_to_buda_netlist(
         }
     }
 
-    vector dp_epochs;
-    unordered_map epoch_info_map;
-    for (size_t epoch_id = 0; epoch_id < epoch_count; ++epoch_id)
+    vector dp_epochs;
+    unordered_map epoch_info_map; // will replace placer_solution.epoch_id_to_epoch_info
+    for (uint32_t epoch_id = 0; epoch_id < epoch_count; ++epoch_id)
     {
         int chip_id = placer_solution.epoch_id_to_chip.at(epoch_id);
         bool is_dp_epoch = false;
@@ -1706,6 +1774,7 @@ BudaNetlist lower_to_buda_netlist(
         }
 
         auto epoch_info = placer_solution.epoch_id_to_epoch_info.at(epoch_id);
+        epoch_info.global_epoch_id += dp_epochs.size(); // number of inserted epochs so far
         epoch_info_map[epoch_id + dp_epochs.size()] = epoch_info;
 
         if (is_dp_epoch)
@@ -1719,15 +1788,17 @@ BudaNetlist lower_to_buda_netlist(
             buda_graph.epoch_types.insert(buda_graph.epoch_types.begin() + epoch_id + dp_epochs.size(), buda_graph.epoch_types.at(epoch_id));
 
             buda_graph.epoch_target_devices.push_back({BudaDevice(1)});
+            // create a new epoch_info for the inserted epoch
             epoch_info_map[epoch_id + dp_epochs.size()] = {
-                .global_epoch_id = epoch_info.global_epoch_id,
-                .temporal_epoch_id = epoch_info.temporal_epoch_id,
+                .global_epoch_id = epoch_info.global_epoch_id + 1, // ensure global id is unique
+                .temporal_epoch_id = epoch_info.temporal_epoch_id, // same as dp epoch
                 .spatial_epoch_id = 1,
                 .epoch_type = epoch_info.epoch_type
             };
         }
         else
         {
+            // not a dp epoch, place it on both devices
             buda_graph.epoch_target_devices.push_back({BudaDevice(0), BudaDevice(1)});
         }
     }
From 40126e7f932bc4497be6c6db9446bb42dc76c6aa Mon Sep 17 00:00:00 2001
From: Lewis Panos
Date: Tue, 27 Aug 2024 15:02:49 +0000
Subject: [PATCH 105/116] Add feature override

(cherry picked from commit 077316e61b87f7f106f65cee6407e1a44e7f0c2a)
---
 README.debug.md                                              | 1 +
 pybuda/csrc/passes/commute_utils.cpp                         | 3 +++
 pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py | 1 +
 3 files changed, 5 insertions(+)

diff --git a/README.debug.md b/README.debug.md
index c0171c12..b76f58ff 100644
--- a/README.debug.md
+++ b/README.debug.md
@@ -120,6 +120,7 @@
  * PYBUDA\_ENABLE\_EMULATION\_DEVICE: This device is a specific silicon emulation device that PyBUDA supports. The variable is used to enable emulation device in PyBUDA pytest environment. By setting this variable to 1, we are instructing PyBUDA to use the emulation device as the target device instead of the silicon or golden device. Enabling the emulation device can be useful for testing or experimentation purposes, allowing us to evaluate the behaviour of our code on this emulation device. In order to run emulation device as a targeted device, the source code must be built with EMULATION_DEVICE_EN=1 environment variable.
  * PYBUDA\_EMULATION\_DEVICE\_ARCH: This env variable represents the architecture of the emulation device used in the pytest.
  * PYBUDA\_DISABLE\_DEPTHWISE\_CONV2D\_DECOMP: If set to 1, depthwise conv2d ops will not be decomposed using the depthwise op and instead use a matmul.
+ * PYBUDA\_DISABLE\_SINGLE\_REDUCE\_COMMUTE: If set to 1, reshapes will not be able to commute through single reduce operations, but may still commute through back-to-back reduce ops. Enabling this may result in additional TMs remaining in the model.
## Golden overrides * GOLDEN\_WORMHOLE\_B0: run Golden with Wormhole_B0 as target device instead of Grayskull (default) diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index de222279..2e0c986f 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -691,6 +691,9 @@ bool commute_through_reduce( if (not can_commute) { + if (env_as("PYBUDA_DISABLE_SINGLE_REDUCE_COMMUTE", "0")) { + return false; + } // auto can_comm_new_dim = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); auto can_comm_new_dim = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); can_commute = std::get<0>(can_comm_new_dim); diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py index fa7ffdba..20dcd03b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py @@ -27,6 +27,7 @@ def test_pidnet_pytorch(variant, test_device): compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_SINGLE_REDUCE_COMMUTE"] = "1" # Load and pre-process image image_path = "third_party/confidential_customer_models/internal/pidnet/files/samples/road_scenes.png" From 4b12394b3af835768575d02b001219a388919070 Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Wed, 28 Aug 2024 16:46:30 +0000 Subject: [PATCH 106/116] Bug in default value for env_as call (cherry picked from commit 2b31c98dbc70a6e6e02fdf4ce95b74d73da23344) --- pybuda/csrc/passes/commute_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index 2e0c986f..8a95834c 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -691,7 +691,7 @@ bool commute_through_reduce( if (not can_commute) { - if (env_as("PYBUDA_DISABLE_SINGLE_REDUCE_COMMUTE", "0")) { + if (env_as("PYBUDA_DISABLE_SINGLE_REDUCE_COMMUTE", false)) { return false; } // auto can_comm_new_dim = can_commute_through_dim(initial_op, graph, reduce_dim, commute_up); From f1f2cfa7b0bda093a2302e29f020667bd76f44a4 Mon Sep 17 00:00:00 2001 From: kkannan Date: Wed, 28 Aug 2024 11:16:25 +0000 Subject: [PATCH 107/116] Add tests for token and sequence classification task for phi2(pytorch) in WHB0 and GS (cherry picked from commit 827ed58a496754b28ae353bda2db482486f48a81) --- .../high_prio/nlp/pytorch/test_phi2.py | 102 +++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py index e64a0dc7..6aa4846f 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi2.py @@ -1,15 +1,18 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 import pybuda from pybuda.verify.backend import verify_module from pybuda import VerifyConfig from pybuda.verify.config import TestKind from pybuda._C.backend_api import BackendType, BackendDevice import torch -from transformers import PhiForCausalLM, AutoTokenizer, PhiConfig +from transformers import PhiForCausalLM, AutoTokenizer, PhiConfig, PhiForTokenClassification, PhiForSequenceClassification import os import pytest # Masked fill kernal produced invalid results in Silicon 
BackendType -# So Disabling the verification in BBE for Silicon BackendType +# So Disabling the verification in BBE for Silicon BackendType for causal LM task # Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712 variants = ["microsoft/phi-2", "microsoft/phi-2-pytdml"] @@ -71,3 +74,98 @@ def test_phi2_clm(test_device, variant): enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, ), ) + + +@pytest.mark.parametrize("variant", variants) +def test_phi2_token_classification(test_device, variant): + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + # PhiConfig from pretrained variant, disable return_dict and caching. + config = PhiConfig.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = PhiConfig(**config_dict) + + # Load tokenizer and model from HuggingFace + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + model = PhiForTokenClassification.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + + # input_prompt + input_prompt = "HuggingFace is a company based in Paris and New York" + + # Tokenize input + inputs = tokenizer(input_prompt, return_tensors="pt") + + input_ids = inputs["input_ids"] + + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + pcc = 0.99 + + if test_device.devtype == pybuda.BackendType.Silicon and test_device.arch == BackendDevice.Wormhole_B0: + pcc = 0.98 + elif test_device.devtype == pybuda.BackendType.Silicon and test_device.arch == BackendDevice.Grayskull: + pcc = 0.97 + + verify_module( + tt_model, + input_shapes=[(input_ids.shape,)], + inputs=[(input_ids,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) + + +@pytest.mark.parametrize("variant", variants) +def test_phi2_sequence_classification(test_device, variant): + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + # PhiConfig from pretrained variant, disable return_dict and caching. + config = PhiConfig.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config_dict["pad_token_id"] = None + config = PhiConfig(**config_dict) + + # Load tokenizer and model from HuggingFace + tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True) + model = PhiForSequenceClassification.from_pretrained(variant, trust_remote_code=True, config=config) + model.eval() + + # input_prompt + input_prompt = "I am not satisfied with the quality of this product." 
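+    # Note: only input_ids are fed to the model below; HF causal-LM sequence
+    # classification heads typically score the final token's hidden state, so
+    # the attention mask can be omitted for this single, unpadded prompt.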
+
+    # Tokenize input
+    inputs = tokenizer(input_prompt, return_tensors="pt")
+
+    input_ids = inputs["input_ids"]
+
+    tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model)
+
+    verify_module(
+        tt_model,
+        input_shapes=[(input_ids.shape,)],
+        inputs=[(input_ids,)],
+        verify_cfg=VerifyConfig(
+            arch=test_device.arch,
+            devtype=test_device.devtype,
+            devmode=test_device.devmode,
+            test_kind=TestKind.INFERENCE,
+        ),
+    )
From 71a8e56e5f01c75f42bdf764eda84a7aad1549cd Mon Sep 17 00:00:00 2001
From: kkannan
Date: Tue, 6 Aug 2024 22:19:53 +0000
Subject: [PATCH 108/116] Add test for phi2 onnx model

(cherry picked from commit b1f96f030d4b3a33616cd3940dfd397d8c16753f)
---
 .../high_prio/nlp/onnx/__init__.py            |  0
 .../high_prio/nlp/onnx/test_phi2.py           | 75 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 pybuda/test/model_demos/high_prio/nlp/onnx/__init__.py
 create mode 100644 pybuda/test/model_demos/high_prio/nlp/onnx/test_phi2.py

diff --git a/pybuda/test/model_demos/high_prio/nlp/onnx/__init__.py b/pybuda/test/model_demos/high_prio/nlp/onnx/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pybuda/test/model_demos/high_prio/nlp/onnx/test_phi2.py b/pybuda/test/model_demos/high_prio/nlp/onnx/test_phi2.py
new file mode 100644
index 00000000..526f5b08
--- /dev/null
+++ b/pybuda/test/model_demos/high_prio/nlp/onnx/test_phi2.py
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+import pybuda
+from pybuda.verify.backend import verify_module
+from pybuda import VerifyConfig
+from pybuda.verify.config import TestKind
+import pytest
+import os
+import onnx
+from transformers import AutoTokenizer
+from pybuda._C.backend_api import BackendDevice
+
+# Masked fill kernel produced invalid results in Silicon BackendType
+# Masked fill is converted to the Where operation after exporting the model to ONNX.
+# The Where operation also produces invalid results on the Silicon BackendType, similar to Masked fill.
+# So disabling the verification in BBE for Silicon BackendType for the causal LM task
+# Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2837

+variants = ["microsoft/phi-2", "microsoft/phi-2-pytdml"]
+
+
+@pytest.mark.parametrize("variant", variants)
+def test_phi2_onnx(variant, test_device):
+
+    # pybuda Configurations
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b
+    compiler_cfg.balancer_policy = "Ribbon"
+
+    if test_device.arch == BackendDevice.Wormhole_B0:
+        os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "20480"
+
+    elif test_device.arch == BackendDevice.Grayskull:
+        os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1"
+        os.environ["PYBUDA_DRAM_FLIP_FLOP"] = "1"
+
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(variant, return_tensors="pt", trust_remote_code=True)
+    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # input_prompt
+    input_prompt = "Write a detailed analogy between mathematics and a lighthouse."
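+    # Note: the tokenizer call below pads/truncates to max_length=256, presumably
+    # to match the static input shape used when decoder_model.onnx was exported;
+    # both input_ids and attention_mask are then passed to verify_module.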
+ + # Tokenize input + inputs = tokenizer( + input_prompt, + return_tensors="pt", + max_length=256, + pad_to_max_length=True, + truncation=True, + ) + + input_ids = inputs["input_ids"] + attn_mask = inputs["attention_mask"] + + variant_name = str(variant.split("/")[-1].replace("-", "_")) + model_name = f"onnx_{variant_name}" + load_path = f"third_party/confidential_customer_models/internal/phi2/files/onnx/{variant_name}/decoder_model.onnx" + + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule(model_name, model, load_path) + + verify_module( + tt_model, + input_shapes=[(input_ids.shape,attn_mask.shape,)], + inputs=[(input_ids,attn_mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, + ), + ) From 66b20342953fb6479eb4cf3e8a5e570cbd5810a5 Mon Sep 17 00:00:00 2001 From: Sterling Taylor Date: Wed, 28 Aug 2024 13:16:57 +0000 Subject: [PATCH 109/116] Update file api.rst (cherry picked from commit 56f9f620d468aa073fc221146b7b93e2eb15f5da) --- docs/public/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/public/api.rst b/docs/public/api.rst index 21de6a12..144994d7 100644 --- a/docs/public/api.rst +++ b/docs/public/api.rst @@ -24,7 +24,7 @@ Python Runtime API C++ Runtime API ****************** -The BUDA Backend used by Python Runtime can be optionally used stand-alone to run pre-compiled TTI models. The API reference for stand-alone BUDA Backend Runtime can be found `here `_. +The BUDA Backend used by Python Runtime can be optionally used stand-alone to run pre-compiled TTI models. Configuration and Placement *************************** From 5a602a18ab421f586143063bcd5e9c6b7a01b420 Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 29 Aug 2024 16:51:55 +0000 Subject: [PATCH 110/116] Add microsoft/phi-3-mini-4k-instruct model for token and sequence classification task in GS (cherry picked from commit 9e849c5c4b3e17d3ead9856647dacdb6de1a4b1c) --- .../high_prio/cnn/pytorch/test_bts.py | 4 ++++ .../high_prio/nlp/pytorch/test_phi3.py | 22 ++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py index 8209d16d..37b2ae02 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_bts.py @@ -18,6 +18,10 @@ from scripts.model import get_bts_model +# Clip produces invalid results in Silicon BackendType +# which leads to pcc drop in normalize op in BTS model +# So Disabling the verification in BBE for Silicon BackendType +# Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2823 variants = ["densenet161_bts", "densenet121_bts"] diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py index 60e92be1..5336441a 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -8,8 +8,10 @@ import pytest # Masked fill kernal produced invalid results in Silicon BackendType -# So Disabling the verification in BBE for Silicon BackendType # Issue link - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2712 +# RMS block of phi3 produced different results on each run in GS when BBE is enabled +# 
https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2838 +# So Disabling the verification in BBE for Silicon BackendType variants = ["microsoft/phi-3-mini-4k-instruct"] @@ -63,8 +65,8 @@ def test_phi3_causal_lm(test_device, variant): verify_module( tt_model, - input_shapes=[input_ids.shape, attn_mask.shape], - inputs=[input_ids, attn_mask], + input_shapes=[(input_ids.shape, attn_mask.shape)], + inputs=[(input_ids, attn_mask)], verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, @@ -83,6 +85,10 @@ def test_phi3_token_classification(test_device, variant): compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" + if test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1" + os.environ["PYBUDA_DRAM_FLIP_FLOP"] = "1" + # Phi3Config from pretrained variant, disable return_dict and caching. config = Phi3Config.from_pretrained(variant) config_dict = config.to_dict() @@ -115,6 +121,7 @@ def test_phi3_token_classification(test_device, variant): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, + enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, ), ) @@ -127,6 +134,10 @@ def test_phi3_sequence_classification(test_device, variant): compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" + if test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1" + os.environ["PYBUDA_DRAM_FLIP_FLOP"] = "1" + # Phi3Config from pretrained variant, disable return_dict and caching. config = Phi3Config.from_pretrained(variant) config_dict = config.to_dict() @@ -152,12 +163,13 @@ def test_phi3_sequence_classification(test_device, variant): verify_module( tt_model, - input_shapes=[(input_ids.shape)], - inputs=[(input_ids)], + input_shapes=[(input_ids.shape,)], + inputs=[(input_ids,)], verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, + enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, ), ) From ecf83fa1e31e3da243d6c50b54d9b3519ed514bd Mon Sep 17 00:00:00 2001 From: kkannan Date: Tue, 3 Sep 2024 12:27:30 +0000 Subject: [PATCH 111/116] Patch BTS image path failure in CI (cherry picked from commit 1cbe5173ed95a21e7234bcc90f87a77d12dd8208) --- .../model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py index 44890104..be35b4e6 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -9,9 +9,8 @@ import cv2 import os import sys -sys.path = list(set(sys.path + ["third_party/confidential_customer_models/internal/tri_basic_2/"])) - -from scripts.semseg import resnet34_semseg +sys.path.append("third_party/confidential_customer_models/internal/tri_basic_2/scripts") +from semseg_tri import resnet34_semseg from pybuda.verify.backend import verify_module from pybuda import VerifyConfig From 7989725537d45396fb8d4666771b2a3dc1dba9af Mon Sep 17 00:00:00 2001 From: Ashok Kumar Kannan Date: Fri, 30 Aug 2024 09:46:02 -0400 Subject: [PATCH 112/116] Add model demos for Nbeats(pytorch) for wormhole and grayskull (cherry picked from commit 
4947ada4446f0ffe959277742372645e47a8014e) --- .../high_prio/cnn/pytorch/test_nbeats.py | 121 ++++++++++++++++++ .../high_prio/nlp/pytorch/test_phi3.py | 14 +- python_env/core_requirements.txt | 1 + 3 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 pybuda/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py new file mode 100644 index 00000000..78766054 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py @@ -0,0 +1,121 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify import TestKind + +import sys + +sys.path.append("third_party/confidential_customer_models/internal/") + +from nbeats.scripts import get_electricity_dataset_input, NBeatsWithGenericBasis, NBeatsWithTrendBasis, NBeatsWithSeasonalityBasis + + +def test_nbeats_with_seasonality_basis(test_device): + # PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + compiler_cfg.enable_auto_fusing = False + + x, x_mask = get_electricity_dataset_input() + + pytorch_model = NBeatsWithSeasonalityBasis( + input_size=72, + output_size=24, + num_of_harmonics=1, + stacks=30, + layers=4, + layer_size=2048, + ) + pytorch_model.eval() + + # Create pybuda.PyTorchModule using the loaded Pytorch model + tt_model = pybuda.PyTorchModule("nbeats_seasonality", pytorch_model) + + pcc = 0.99 + + verify_module( + tt_model, + input_shapes=[(x.shape, x_mask.shape)], + inputs=[(x, x_mask)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) + + + +def test_nbeats_with_generic_basis(test_device): + # PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + + x, x_mask = get_electricity_dataset_input() + + pytorch_model = NBeatsWithGenericBasis( + input_size=72, output_size=24, stacks=30, layers=4, layer_size=512 + ) + pytorch_model.eval() + + # Create pybuda.PyTorchModule using the loaded Pytorch model + tt_model = pybuda.PyTorchModule("nbeats_generic", pytorch_model) + + pcc = 0.99 + + verify_module( + tt_model, + input_shapes=[(x.shape, x_mask.shape)], + inputs=[(x, x_mask)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) + + +def test_nbeats_with_trend_basis(test_device): + # PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + + if test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + + x, x_mask = get_electricity_dataset_input() + + pytorch_model = NBeatsWithTrendBasis( + input_size=72, + output_size=24, + degree_of_polynomial=3, + stacks=30, + layers=4, + layer_size=256, + ) + pytorch_model.eval() + + # Create pybuda.PyTorchModule using the loaded Pytorch model + tt_model = pybuda.PyTorchModule("nbeats_trend", pytorch_model) + + pcc = 0.99 + + verify_module( + tt_model, + input_shapes=[(x.shape, x_mask.shape)], + inputs=[(x, 
x_mask)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py index 5336441a..1c2f8fee 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -112,6 +112,11 @@ def test_phi3_token_classification(test_device, variant): tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + enabled = True + if test_device.devtype == pybuda.BackendType.Silicon: + if test_device.arch == BackendDevice.Grayskull: + enabled = False + verify_module( tt_model, input_shapes=[(input_ids.shape,)], @@ -121,7 +126,7 @@ def test_phi3_token_classification(test_device, variant): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, + enabled=enabled ), ) @@ -161,6 +166,11 @@ def test_phi3_sequence_classification(test_device, variant): tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + enabled = True + if test_device.devtype == pybuda.BackendType.Silicon: + if test_device.arch == BackendDevice.Grayskull: + enabled = False + verify_module( tt_model, input_shapes=[(input_ids.shape,)], @@ -170,6 +180,6 @@ def test_phi3_sequence_classification(test_device, variant): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - enabled=False if test_device.devtype == pybuda.BackendType.Silicon else True, + enabled=enabled, ), ) diff --git a/python_env/core_requirements.txt b/python_env/core_requirements.txt index 0442bbb0..24900140 100644 --- a/python_env/core_requirements.txt +++ b/python_env/core_requirements.txt @@ -46,3 +46,4 @@ tflite==2.10.0 ultralytics==8.0.145 keras==2.13.1 pytorch_forecasting==1.0.0 +patool From 3ef46b90693c417febd8234fb596f5c9cc635aff Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Thu, 5 Sep 2024 07:58:00 +0000 Subject: [PATCH 113/116] LICENSE: Whitespace cleanups and additions of references to sfpi-gcc licenses (cherry picked from commit 00225cfb32510d73c62baae73a11438f7d52fdda) --- LICENCE => LICENSE | 112 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 16 deletions(-) rename LICENCE => LICENSE (81%) diff --git a/LICENCE b/LICENSE similarity index 81% rename from LICENCE rename to LICENSE index 053f2cac..e97a525c 100644 --- a/LICENCE +++ b/LICENSE @@ -156,43 +156,118 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS + --------------------------------------------------------------------------- + Third-Party Dependencies: + +The following dependencies are utilized by this project and are included +in a distributed build of a Python Wheel: + +- sfpi-gcc - https://github.com/tenstorrent-metal/sfpi-rel/blob/master/LICENSE https://github.com/tenstorrent-metal/sfpi-rel/blob/master/compiler/LICENSE + The following dependencies are utilized by this project but are not explicitly distributed as part of the software: + - boost - https://www.boost.org/LICENSE_1_0.txt - googletest - https://github.com/google/googletest/blob/main/LICENSE + --------------------------------------------------------------------------- + Third-Party Sources: -The following are licenses used by third-party sources which are distributed -as part of the software, as identified within the software comments. The full text -of the listed licenses are reproduced below: + +The following are licenses used by third-party sources which are +distributed as part of the software, as identified within the software +comments. The full text of the listed licenses are reproduced below: + - Apache 2.0 (as above) + - BSD-3 -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + - BSD -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ - MIT -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +“Software”), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN +NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. + - caffe2 + Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. + All rights reserved. + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright @@ -202,6 +277,7 @@ modification, are permitted provided that the following conditions are met: and IDIAP Research Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -213,23 +289,27 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + - boost + Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: + The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. \ No newline at end of file +DEALINGS IN THE SOFTWARE. 
From 0df57b3d72c9b15a0fbf87c993c15c0b2db29bc5 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic Date: Thu, 5 Sep 2024 09:51:05 +0000 Subject: [PATCH 114/116] Update submodule --- third_party/budabackend | 2 +- third_party/tvm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/budabackend b/third_party/budabackend index e3af07f5..4f1bc0d4 160000 --- a/third_party/budabackend +++ b/third_party/budabackend @@ -1 +1 @@ -Subproject commit e3af07f5059d026a1c2d839254197fda7bad15ef +Subproject commit 4f1bc0d417d5a6b7a6c52a5300608a9948528d6b diff --git a/third_party/tvm b/third_party/tvm index c748d645..95442362 160000 --- a/third_party/tvm +++ b/third_party/tvm @@ -1 +1 @@ -Subproject commit c748d64552cd86cc3406511366d96ae8ae866ecd +Subproject commit 95442362d691d9983a1779d1a4af359784e5bc73 From 1dea9a76c8e70f324df1d6508b88410d54920c6f Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic Date: Mon, 9 Sep 2024 08:55:27 +0000 Subject: [PATCH 115/116] Update tt-budabackend submodule --- third_party/budabackend | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/budabackend b/third_party/budabackend index 4f1bc0d4..58fab9cd 160000 --- a/third_party/budabackend +++ b/third_party/budabackend @@ -1 +1 @@ -Subproject commit 4f1bc0d417d5a6b7a6c52a5300608a9948528d6b +Subproject commit 58fab9cd7ac53176363fc3ee61d40f434778c964 From 1e949f50075591b7c66e565951bf68c4ff1b0a69 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Mon, 9 Sep 2024 08:30:36 +0000 Subject: [PATCH 116/116] LICENSE: refine wording (cherry picked from commit 6adecc286c635d9ddc36bb72fc435e60df911537) --- LICENSE | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index e97a525c..e131f6e9 100644 --- a/LICENSE +++ b/LICENSE @@ -161,8 +161,9 @@ Third-Party Dependencies: -The following dependencies are utilized by this project and are included -in a distributed build of a Python Wheel: +The following separate and independent dependencies are utilized by this +project and are included in a distributed build of a Python Wheel and +are subject to their own license terms listed as follows: - sfpi-gcc - https://github.com/tenstorrent-metal/sfpi-rel/blob/master/LICENSE https://github.com/tenstorrent-metal/sfpi-rel/blob/master/compiler/LICENSE
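
For reference, the epoch renumbering introduced in [PATCH 104/116] ("Adjust
epoch id after data parallel modifications") shifts every epoch id up by the
number of n300 data-parallel epochs that strictly precede it, because an empty
mirror epoch is inserted immediately after each data-parallel epoch. The
minimal Python sketch below restates that rule; it is an illustration only
(dp_epochs is assumed to hold the original ids of the data-parallel epochs),
not code from any patch in this series.

def get_updated_epoch_id(epoch_id, dp_epochs):
    # One empty epoch is inserted after each data-parallel epoch, so every
    # later epoch slides up by one slot per insertion that precedes it.
    num_insertions = sum(1 for dp_epoch in dp_epochs if epoch_id > dp_epoch)
    return epoch_id + num_insertions

# Example: with data-parallel epochs at original ids 1 and 3, the schedule
# [0, 1, 2, 3, 4] is renumbered to [0, 1, 3, 4, 6]; ids 2 and 5 are left
# free for the two inserted mirror epochs.
assert [get_updated_epoch_id(e, [1, 3]) for e in range(5)] == [0, 1, 3, 4, 6]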